Re: [PATCH v3 06/18] nfs: add support for multiple nfs reqs per page

Anna Schumaker <schumaker.anna@xxxxxxxxx> · Fri, 16 May 2014 08:58:00 -0400

On 05/15/2014 06:19 PM, Weston Andros Adamson wrote:
> On May 15, 2014, at 5:12 PM, Anna Schumaker <schumaker.anna@xxxxxxxxx> wrote:
>
>> On 05/15/2014 11:56 AM, Weston Andros Adamson wrote:
>>> Add "page groups" - a circular list of nfs requests (struct nfs_page)
>>> that all reference the same page. This gives nfs read and write paths
>>> the ability to account for sub-page regions independently.  This
>>> somewhat follows the design of struct buffer_head's sub-page
>>> accounting.
>>>
>>> Only "head" requests are ever added/removed from the inode list in
>>> the buffered write path. "head" and "sub" requests are treated the
>>> same through the read path and the rest of the write/commit path.
>>> Requests are given an extra reference across the life of the list.
>>>
>>> Page groups are never rejoined after being split. If the read/write
>>> request fails and the client falls back to another path (ie revert
>>> to MDS in PNFS case), the already split requests are pushed through
>>> the recoalescing code again, which may split them further and then
>>> coalesce them into properly sized requests on the wire. Fragmentation
>>> shouldn't be a problem with the current design, because we flush all
>>> requests in page group when a non-contiguous request is added, so
>>> the only time resplitting should occur is on a resend of a read or
>>> write.
>>>
>>> This patch lays the groundwork for sub-page splitting, but does not
>>> actually do any splitting. For now all page groups have one request
>>> as pg_test functions don't yet split pages. There are several related
>>> patches that are needed support multiple requests per page group.
>>>
>>> Signed-off-by: Weston Andros Adamson <dros@xxxxxxxxxxxxxxx>
>>> ---
>>> fs/nfs/direct.c          |   7 +-
>>> fs/nfs/pagelist.c        | 220 ++++++++++++++++++++++++++++++++++++++++++++---
>>> fs/nfs/read.c            |   4 +-
>>> fs/nfs/write.c           |  13 ++-
>>> include/linux/nfs_page.h |  13 ++-
>>> 5 files changed, 236 insertions(+), 21 deletions(-)
>>>
>>> diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
>>> index 1dd8c62..2c0e08f 100644
>>> --- a/fs/nfs/direct.c
>>> +++ b/fs/nfs/direct.c
>>> @@ -380,7 +380,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *de
>>> 			struct nfs_page *req;
>>> 			unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
>>> 			/* XXX do we need to do the eof zeroing found in async_filler? */
>>> -			req = nfs_create_request(dreq->ctx, pagevec[i],
>>> +			req = nfs_create_request(dreq->ctx, pagevec[i], NULL,
>>> 						 pgbase, req_len);
>>> 			if (IS_ERR(req)) {
>>> 				result = PTR_ERR(req);
>>> @@ -749,7 +749,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *d
>>> 			struct nfs_page *req;
>>> 			unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase);
>>>
>>> -			req = nfs_create_request(dreq->ctx, pagevec[i],
>>> +			req = nfs_create_request(dreq->ctx, pagevec[i], NULL,
>>> 						 pgbase, req_len);
>>> 			if (IS_ERR(req)) {
>>> 				result = PTR_ERR(req);
>>> @@ -827,6 +827,8 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
>>> 	spin_unlock(&dreq->lock);
>>>
>>> 	while (!list_empty(&hdr->pages)) {
>>> +		bool do_destroy = true;
>>> +
>>> 		req = nfs_list_entry(hdr->pages.next);
>>> 		nfs_list_remove_request(req);
>>> 		switch (bit) {
>>> @@ -834,6 +836,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
>>> 		case NFS_IOHDR_NEED_COMMIT:
>>> 			kref_get(&req->wb_kref);
>>> 			nfs_mark_request_commit(req, hdr->lseg, &cinfo);
>>> +			do_destroy = false;
>>> 		}
>>> 		nfs_unlock_and_release_request(req);
>>> 	}
>>> diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
>>> index a71655a..517c617 100644
>>> --- a/fs/nfs/pagelist.c
>>> +++ b/fs/nfs/pagelist.c
>>> @@ -29,6 +29,8 @@
>>> static struct kmem_cache *nfs_page_cachep;
>>> static const struct rpc_call_ops nfs_pgio_common_ops;
>>>
>>> +static void nfs_free_request(struct nfs_page *);
>>> +
>>> static bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount)
>>> {
>>> 	p->npages = pagecount;
>>> @@ -136,10 +138,151 @@ nfs_iocounter_wait(struct nfs_io_counter *c)
>>> 	return __nfs_iocounter_wait(c);
>>> }
>>>
>>> +/*
>>> + * nfs_page_group_lock - lock the head of the page group
>>> + * @req - request in group that is to be locked
>>> + *
>>> + * this lock must be held if modifying the page group list
>>> + */
>>> +void
>>> +nfs_page_group_lock(struct nfs_page *req)
>>> +{
>>> +	struct nfs_page *head = req->wb_head;
>>> +	int err = -EAGAIN;
>>> +
>>> +	WARN_ON_ONCE(head != head->wb_head);
>>> +
>>> +	while (err)
>>> +		err = wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK,
>>> +			nfs_wait_bit_killable, TASK_KILLABLE);
>>> +}
>>> +
>>> +/*
>>> + * nfs_page_group_unlock - unlock the head of the page group
>>> + * @req - request in group that is to be unlocked
>>> + */
>>> +void
>>> +nfs_page_group_unlock(struct nfs_page *req)
>>> +{
>>> +	struct nfs_page *head = req->wb_head;
>>> +
>>> +	WARN_ON_ONCE(head != head->wb_head);
>>> +
>>> +	smp_mb__before_clear_bit();
>>> +	clear_bit(PG_HEADLOCK, &head->wb_flags);
>>> +	smp_mb__after_clear_bit();
>>> +	wake_up_bit(&head->wb_flags, PG_HEADLOCK);
>>> +}
>>> +
>>> +/*
>>> + * nfs_page_group_sync_on_bit_locked
>>> + *
>>> + * must be called with page group lock held
>>> + */
>>> +static bool
>>> +nfs_page_group_sync_on_bit_locked(struct nfs_page *req, unsigned int bit)
>>> +{
>>> +	struct nfs_page *head = req->wb_head;
>>> +	struct nfs_page *tmp;
>>> +
>>> +	WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &head->wb_flags));
>>> +	WARN_ON_ONCE(test_and_set_bit(bit, &req->wb_flags));
>>> +
>>> +	tmp = req->wb_this_page;
>>> +	while (tmp != req) {
>>> +		if (!test_bit(bit, &tmp->wb_flags))
>>> +			return false;
>>> +		tmp = tmp->wb_this_page;
>>> +	}
>>> +
>>> +	/* true! reset all bits */
>>> +	tmp = req;
>>> +	do {
>>> +		clear_bit(bit, &tmp->wb_flags);
>>> +		tmp = tmp->wb_this_page;
>>> +	} while (tmp != req);
>>> +
>>> +	return true;
>>> +}
>>> +
>>> +/*
>>> + * nfs_page_group_sync_on_bit - set bit on current request, but only
>>> + *   return true if the bit is set for all requests in page group
>>> + * @req - request in page group
>>> + * @bit - PG_* bit that is used to sync page group
>>> + */
>>> +bool nfs_page_group_sync_on_bit(struct nfs_page *req, unsigned int bit)
>>> +{
>>> +	bool ret;
>>> +
>>> +	nfs_page_group_lock(req);
>>> +	ret = nfs_page_group_sync_on_bit_locked(req, bit);
>>> +	nfs_page_group_unlock(req);
>>> +
>>> +	return ret;
>>> +}
>>> +
>>> +/*
>>> + * nfs_page_group_init - Initialize the page group linkage for @req
>>> + * @req - a new nfs request
>>> + * @prev - the previous request in page group, or NULL if @req is the first
>>> + *         or only request in the group (the head).
>>> + */
>>> +static inline void
>>> +nfs_page_group_init(struct nfs_page *req, struct nfs_page *prev)
>>> +{
>>> +	WARN_ON_ONCE(prev == req);
>>> +
>>> +	if (!prev) {
>>> +		req->wb_head = req;
>>> +		req->wb_this_page = req;
>>> +	} else {
>>> +		WARN_ON_ONCE(prev->wb_this_page != prev->wb_head);
>>> +		WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &prev->wb_head->wb_flags));
>>> +		req->wb_head = prev->wb_head;
>>> +		req->wb_this_page = prev->wb_this_page;
>>> +		prev->wb_this_page = req;
>>> +
>>> +		/* grab extra ref if head request has extra ref from
>>> +		 * the write/commit path to handle handoff between write
>>> +		 * and commit lists */
>>> +		if (test_bit(PG_INODE_REF, &prev->wb_head->wb_flags))
>>> +			kref_get(&req->wb_kref);
>>> +	}
>>> +}
>>> +
>>> +/*
>>> + * nfs_page_group_destroy - sync the destruction of page groups
>>> + * @req - request that no longer needs the page group
>>> + *
>>> + * releases the page group reference from each member once all
>>> + * members have called this function.
>>> + */
>>> +static void
>>> +nfs_page_group_destroy(struct kref *kref)
>>> +{
>>> +	struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref);
>>> +	struct nfs_page *tmp, *next;
>>> +
>>> +	if (!nfs_page_group_sync_on_bit(req, PG_TEARDOWN))
>>> +		return;
>>> +
>>> +	tmp = req;
>>> +	do {
>>> +		next = tmp->wb_this_page;
>>> +		/* unlink and free */
>>> +		tmp->wb_this_page = tmp;
>>> +		tmp->wb_head = tmp;
>>> +		nfs_free_request(tmp);
>>> +		tmp = next;
>>> +	} while (tmp != req);
>>> +}
>>> +
>>> /**
>>>  * nfs_create_request - Create an NFS read/write request.
>>>  * @ctx: open context to use
>>>  * @page: page to write
>>> + * @last: last nfs request created for this page group or NULL if head
>>>  * @offset: starting offset within the page for the write
>>>  * @count: number of bytes to read/write
>>>  *
>>> @@ -149,7 +292,8 @@ nfs_iocounter_wait(struct nfs_io_counter *c)
>>>  */
>>> struct nfs_page *
>>> nfs_create_request(struct nfs_open_context *ctx, struct page *page,
>>> -		   unsigned int offset, unsigned int count)
>>> +		   struct nfs_page *last, unsigned int offset,
>>> +		   unsigned int count)
>>> {
>>> 	struct nfs_page		*req;
>>> 	struct nfs_lock_context *l_ctx;
>>> @@ -181,6 +325,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page,
>>> 	req->wb_bytes   = count;
>>> 	req->wb_context = get_nfs_open_context(ctx);
>>> 	kref_init(&req->wb_kref);
>>> +	nfs_page_group_init(req, last);
>>> 	return req;
>>> }
>>>
>>> @@ -238,16 +383,18 @@ static void nfs_clear_request(struct nfs_page *req)
>>> 	}
>>> }
>>>
>>> -
>>> /**
>>>  * nfs_release_request - Release the count on an NFS read/write request
>>>  * @req: request to release
>>>  *
>>>  * Note: Should never be called with the spinlock held!
>>>  */
>>> -static void nfs_free_request(struct kref *kref)
>>> +static void nfs_free_request(struct nfs_page *req)
>>> {
>>> -	struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref);
>>> +	WARN_ON_ONCE(req->wb_this_page != req);
>>> +
>>> +	/* extra debug: make sure no sync bits are still set */
>>> +	WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags));
>>>
>>> 	/* Release struct file and open context */
>>> 	nfs_clear_request(req);
>>> @@ -256,7 +403,7 @@ static void nfs_free_request(struct kref *kref)
>>>
>>> void nfs_release_request(struct nfs_page *req)
>>> {
>>> -	kref_put(&req->wb_kref, nfs_free_request);
>>> +	kref_put(&req->wb_kref, nfs_page_group_destroy);
>>> }
>>>
>>> static int nfs_wait_bit_uninterruptible(void *word)
>>> @@ -833,21 +980,66 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc)
>>>  * @desc: destination io descriptor
>>>  * @req: request
>>>  *
>>> + * This may split a request into subrequests which are all part of the
>>> + * same page group.
>>> + *
>>>  * Returns true if the request 'req' was successfully coalesced into the
>>>  * existing list of pages 'desc'.
>>>  */
>>> static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
>>> 			   struct nfs_page *req)
>>> {
>>> -	while (!nfs_pageio_do_add_request(desc, req)) {
>>> -		desc->pg_moreio = 1;
>>> -		nfs_pageio_doio(desc);
>>> -		if (desc->pg_error < 0)
>>> -			return 0;
>>> -		desc->pg_moreio = 0;
>>> -		if (desc->pg_recoalesce)
>>> -			return 0;
>>> -	}
>>> +	struct nfs_page *subreq;
>>> +	unsigned int bytes_left = 0;
>>> +	unsigned int offset, pgbase;
>>> +
>>> +	nfs_page_group_lock(req);
>>> +
>>> +	subreq = req;
>>> +	bytes_left = subreq->wb_bytes;
>>> +	offset = subreq->wb_offset;
>>> +	pgbase = subreq->wb_pgbase;
>>> +
>>> +	do {
>>> +		if (!nfs_pageio_do_add_request(desc, subreq)) {
>>> +			/* make sure pg_test call(s) did nothing */
>>> +			WARN_ON_ONCE(subreq->wb_bytes != bytes_left);
>>> +			WARN_ON_ONCE(subreq->wb_offset != offset);
>>> +			WARN_ON_ONCE(subreq->wb_pgbase != pgbase);
>>> +
>>> +			nfs_page_group_unlock(req);
>>> +			desc->pg_moreio = 1;
>>> +			nfs_pageio_doio(desc);
>>> +			if (desc->pg_error < 0)
>>> +				return 0;
>>> +			desc->pg_moreio = 0;
>>> +			if (desc->pg_recoalesce)
>>> +				return 0;
>> Would it make sense to 'or' together the desc->pg_error and desc->pg_recoalesce checks, rather than having two distinct if statements with the same return value?
>>
>> Anna
>>
> That�s a copy-paste from the original __nfs_pageio_add_request. I didn�t combine the
> statements, because desc->pg_moreio is set to 0 iff desc->pg_error < 0 evaluates to false. 
>
> We might be able to clean this up, but it�s not just as simple as combining them into one
> statement.

Fair enough! I must have missed the copy-and-paste part.

Anna
>
> -dros
>
>>> +			/* retry add_request for this subreq */
>>> +			nfs_page_group_lock(req);
>>> +			continue;
>>> +		}
>>> +
>>> +		/* check for buggy pg_test call(s) */
>>> +		WARN_ON_ONCE(subreq->wb_bytes + subreq->wb_pgbase > PAGE_SIZE);
>>> +		WARN_ON_ONCE(subreq->wb_bytes > bytes_left);
>>> +		WARN_ON_ONCE(subreq->wb_bytes == 0);
>>> +
>>> +		bytes_left -= subreq->wb_bytes;
>>> +		offset += subreq->wb_bytes;
>>> +		pgbase += subreq->wb_bytes;
>>> +
>>> +		if (bytes_left) {
>>> +			subreq = nfs_create_request(req->wb_context,
>>> +					req->wb_page,
>>> +					subreq, pgbase, bytes_left);
>>> +			nfs_lock_request(subreq);
>>> +			subreq->wb_offset  = offset;
>>> +			subreq->wb_index = req->wb_index;
>>> +		}
>>> +	} while (bytes_left > 0);
>>> +
>>> +	nfs_page_group_unlock(req);
>>> 	return 1;
>>> }
>>>
>>> diff --git a/fs/nfs/read.c b/fs/nfs/read.c
>>> index 46d9044..902ba2c 100644
>>> --- a/fs/nfs/read.c
>>> +++ b/fs/nfs/read.c
>>> @@ -85,7 +85,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode,
>>> 	len = nfs_page_length(page);
>>> 	if (len == 0)
>>> 		return nfs_return_empty_page(page);
>>> -	new = nfs_create_request(ctx, page, 0, len);
>>> +	new = nfs_create_request(ctx, page, NULL, 0, len);
>>> 	if (IS_ERR(new)) {
>>> 		unlock_page(page);
>>> 		return PTR_ERR(new);
>>> @@ -311,7 +311,7 @@ readpage_async_filler(void *data, struct page *page)
>>> 	if (len == 0)
>>> 		return nfs_return_empty_page(page);
>>>
>>> -	new = nfs_create_request(desc->ctx, page, 0, len);
>>> +	new = nfs_create_request(desc->ctx, page, NULL, 0, len);
>>> 	if (IS_ERR(new))
>>> 		goto out_error;
>>>
>>> diff --git a/fs/nfs/write.c b/fs/nfs/write.c
>>> index e773df2..d0f30f1 100644
>>> --- a/fs/nfs/write.c
>>> +++ b/fs/nfs/write.c
>>> @@ -367,6 +367,8 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
>>> {
>>> 	struct nfs_inode *nfsi = NFS_I(inode);
>>>
>>> +	WARN_ON_ONCE(req->wb_this_page != req);
>>> +
>>> 	/* Lock the request! */
>>> 	nfs_lock_request(req);
>>>
>>> @@ -383,6 +385,7 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req)
>>> 		set_page_private(req->wb_page, (unsigned long)req);
>>> 	}
>>> 	nfsi->npages++;
>>> +	set_bit(PG_INODE_REF, &req->wb_flags);
>>> 	kref_get(&req->wb_kref);
>>> 	spin_unlock(&inode->i_lock);
>>> }
>>> @@ -567,6 +570,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr)
>>> {
>>> 	struct nfs_commit_info cinfo;
>>> 	unsigned long bytes = 0;
>>> +	bool do_destroy;
>>>
>>> 	if (test_bit(NFS_IOHDR_REDO, &hdr->flags))
>>> 		goto out;
>>> @@ -596,6 +600,7 @@ remove_req:
>>> next:
>>> 		nfs_unlock_request(req);
>>> 		nfs_end_page_writeback(req->wb_page);
>>> +		do_destroy = !test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags);
>>> 		nfs_release_request(req);
>>> 	}
>>> out:
>>> @@ -700,6 +705,10 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode,
>>> 		if (req == NULL)
>>> 			goto out_unlock;
>>>
>>> +		/* should be handled by nfs_flush_incompatible */
>>> +		WARN_ON_ONCE(req->wb_head != req);
>>> +		WARN_ON_ONCE(req->wb_this_page != req);
>>> +
>>> 		rqend = req->wb_offset + req->wb_bytes;
>>> 		/*
>>> 		 * Tell the caller to flush out the request if
>>> @@ -761,7 +770,7 @@ static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx,
>>> 	req = nfs_try_to_update_request(inode, page, offset, bytes);
>>> 	if (req != NULL)
>>> 		goto out;
>>> -	req = nfs_create_request(ctx, page, offset, bytes);
>>> +	req = nfs_create_request(ctx, page, NULL, offset, bytes);
>>> 	if (IS_ERR(req))
>>> 		goto out;
>>> 	nfs_inode_add_request(inode, req);
>>> @@ -805,6 +814,8 @@ int nfs_flush_incompatible(struct file *file, struct page *page)
>>> 			return 0;
>>> 		l_ctx = req->wb_lock_context;
>>> 		do_flush = req->wb_page != page || req->wb_context != ctx;
>>> +		/* for now, flush if more than 1 request in page_group */
>>> +		do_flush |= req->wb_this_page != req;
>>> 		if (l_ctx && ctx->dentry->d_inode->i_flock != NULL) {
>>> 			do_flush |= l_ctx->lockowner.l_owner != current->files
>>> 				|| l_ctx->lockowner.l_pid != current->tgid;
>>> diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
>>> index 13d59af..986c0c2 100644
>>> --- a/include/linux/nfs_page.h
>>> +++ b/include/linux/nfs_page.h
>>> @@ -26,6 +26,9 @@ enum {
>>> 	PG_MAPPED,		/* page private set for buffered io */
>>> 	PG_CLEAN,		/* write succeeded */
>>> 	PG_COMMIT_TO_DS,	/* used by pnfs layouts */
>>> +	PG_INODE_REF,		/* extra ref held by inode (head req only) */
>>> +	PG_HEADLOCK,		/* page group lock of wb_head */
>>> +	PG_TEARDOWN,		/* page group sync for destroy */
>>> };
>>>
>>> struct nfs_inode;
>>> @@ -41,6 +44,8 @@ struct nfs_page {
>>> 	struct kref		wb_kref;	/* reference count */
>>> 	unsigned long		wb_flags;
>>> 	struct nfs_write_verifier	wb_verf;	/* Commit cookie */
>>> +	struct nfs_page		*wb_this_page;  /* list of reqs for this page */
>>> +	struct nfs_page		*wb_head;       /* head pointer for req list */
>>> };
>>>
>>> struct nfs_pageio_descriptor;
>>> @@ -87,9 +92,10 @@ struct nfs_pageio_descriptor {
>>>
>>> extern	struct nfs_page *nfs_create_request(struct nfs_open_context *ctx,
>>> 					    struct page *page,
>>> +					    struct nfs_page *last,
>>> 					    unsigned int offset,
>>> 					    unsigned int count);
>>> -extern	void nfs_release_request(struct nfs_page *req);
>>> +extern	void nfs_release_request(struct nfs_page *);
>>>
>>>
>>> extern	void nfs_pageio_init(struct nfs_pageio_descriptor *desc,
>>> @@ -108,7 +114,10 @@ extern size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
>>> 				struct nfs_page *req);
>>> extern  int nfs_wait_on_request(struct nfs_page *);
>>> extern	void nfs_unlock_request(struct nfs_page *req);
>>> -extern	void nfs_unlock_and_release_request(struct nfs_page *req);
>>> +extern	void nfs_unlock_and_release_request(struct nfs_page *);
>>> +extern void nfs_page_group_lock(struct nfs_page *);
>>> +extern void nfs_page_group_unlock(struct nfs_page *);
>>> +extern bool nfs_page_group_sync_on_bit(struct nfs_page *, unsigned int);
>>>
>>> /*
>>>  * Lock the page of an asynchronous request

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html