On 05/15/2014 06:19 PM, Weston Andros Adamson wrote: > On May 15, 2014, at 5:12 PM, Anna Schumaker <schumaker.anna@xxxxxxxxx> wrote: > >> On 05/15/2014 11:56 AM, Weston Andros Adamson wrote: >>> Add "page groups" - a circular list of nfs requests (struct nfs_page) >>> that all reference the same page. This gives nfs read and write paths >>> the ability to account for sub-page regions independently. This >>> somewhat follows the design of struct buffer_head's sub-page >>> accounting. >>> >>> Only "head" requests are ever added/removed from the inode list in >>> the buffered write path. "head" and "sub" requests are treated the >>> same through the read path and the rest of the write/commit path. >>> Requests are given an extra reference across the life of the list. >>> >>> Page groups are never rejoined after being split. If the read/write >>> request fails and the client falls back to another path (ie revert >>> to MDS in PNFS case), the already split requests are pushed through >>> the recoalescing code again, which may split them further and then >>> coalesce them into properly sized requests on the wire. Fragmentation >>> shouldn't be a problem with the current design, because we flush all >>> requests in page group when a non-contiguous request is added, so >>> the only time resplitting should occur is on a resend of a read or >>> write. >>> >>> This patch lays the groundwork for sub-page splitting, but does not >>> actually do any splitting. For now all page groups have one request >>> as pg_test functions don't yet split pages. There are several related >>> patches that are needed support multiple requests per page group. >>> >>> Signed-off-by: Weston Andros Adamson <dros@xxxxxxxxxxxxxxx> >>> --- >>> fs/nfs/direct.c | 7 +- >>> fs/nfs/pagelist.c | 220 ++++++++++++++++++++++++++++++++++++++++++++--- >>> fs/nfs/read.c | 4 +- >>> fs/nfs/write.c | 13 ++- >>> include/linux/nfs_page.h | 13 ++- >>> 5 files changed, 236 insertions(+), 21 deletions(-) >>> >>> diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c >>> index 1dd8c62..2c0e08f 100644 >>> --- a/fs/nfs/direct.c >>> +++ b/fs/nfs/direct.c >>> @@ -380,7 +380,7 @@ static ssize_t nfs_direct_read_schedule_segment(struct nfs_pageio_descriptor *de >>> struct nfs_page *req; >>> unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase); >>> /* XXX do we need to do the eof zeroing found in async_filler? */ >>> - req = nfs_create_request(dreq->ctx, pagevec[i], >>> + req = nfs_create_request(dreq->ctx, pagevec[i], NULL, >>> pgbase, req_len); >>> if (IS_ERR(req)) { >>> result = PTR_ERR(req); >>> @@ -749,7 +749,7 @@ static ssize_t nfs_direct_write_schedule_segment(struct nfs_pageio_descriptor *d >>> struct nfs_page *req; >>> unsigned int req_len = min_t(size_t, bytes, PAGE_SIZE - pgbase); >>> >>> - req = nfs_create_request(dreq->ctx, pagevec[i], >>> + req = nfs_create_request(dreq->ctx, pagevec[i], NULL, >>> pgbase, req_len); >>> if (IS_ERR(req)) { >>> result = PTR_ERR(req); >>> @@ -827,6 +827,8 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) >>> spin_unlock(&dreq->lock); >>> >>> while (!list_empty(&hdr->pages)) { >>> + bool do_destroy = true; >>> + >>> req = nfs_list_entry(hdr->pages.next); >>> nfs_list_remove_request(req); >>> switch (bit) { >>> @@ -834,6 +836,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) >>> case NFS_IOHDR_NEED_COMMIT: >>> kref_get(&req->wb_kref); >>> nfs_mark_request_commit(req, hdr->lseg, &cinfo); >>> + do_destroy = false; >>> } >>> nfs_unlock_and_release_request(req); >>> } >>> diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c >>> index a71655a..517c617 100644 >>> --- a/fs/nfs/pagelist.c >>> +++ b/fs/nfs/pagelist.c >>> @@ -29,6 +29,8 @@ >>> static struct kmem_cache *nfs_page_cachep; >>> static const struct rpc_call_ops nfs_pgio_common_ops; >>> >>> +static void nfs_free_request(struct nfs_page *); >>> + >>> static bool nfs_pgarray_set(struct nfs_page_array *p, unsigned int pagecount) >>> { >>> p->npages = pagecount; >>> @@ -136,10 +138,151 @@ nfs_iocounter_wait(struct nfs_io_counter *c) >>> return __nfs_iocounter_wait(c); >>> } >>> >>> +/* >>> + * nfs_page_group_lock - lock the head of the page group >>> + * @req - request in group that is to be locked >>> + * >>> + * this lock must be held if modifying the page group list >>> + */ >>> +void >>> +nfs_page_group_lock(struct nfs_page *req) >>> +{ >>> + struct nfs_page *head = req->wb_head; >>> + int err = -EAGAIN; >>> + >>> + WARN_ON_ONCE(head != head->wb_head); >>> + >>> + while (err) >>> + err = wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK, >>> + nfs_wait_bit_killable, TASK_KILLABLE); >>> +} >>> + >>> +/* >>> + * nfs_page_group_unlock - unlock the head of the page group >>> + * @req - request in group that is to be unlocked >>> + */ >>> +void >>> +nfs_page_group_unlock(struct nfs_page *req) >>> +{ >>> + struct nfs_page *head = req->wb_head; >>> + >>> + WARN_ON_ONCE(head != head->wb_head); >>> + >>> + smp_mb__before_clear_bit(); >>> + clear_bit(PG_HEADLOCK, &head->wb_flags); >>> + smp_mb__after_clear_bit(); >>> + wake_up_bit(&head->wb_flags, PG_HEADLOCK); >>> +} >>> + >>> +/* >>> + * nfs_page_group_sync_on_bit_locked >>> + * >>> + * must be called with page group lock held >>> + */ >>> +static bool >>> +nfs_page_group_sync_on_bit_locked(struct nfs_page *req, unsigned int bit) >>> +{ >>> + struct nfs_page *head = req->wb_head; >>> + struct nfs_page *tmp; >>> + >>> + WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &head->wb_flags)); >>> + WARN_ON_ONCE(test_and_set_bit(bit, &req->wb_flags)); >>> + >>> + tmp = req->wb_this_page; >>> + while (tmp != req) { >>> + if (!test_bit(bit, &tmp->wb_flags)) >>> + return false; >>> + tmp = tmp->wb_this_page; >>> + } >>> + >>> + /* true! reset all bits */ >>> + tmp = req; >>> + do { >>> + clear_bit(bit, &tmp->wb_flags); >>> + tmp = tmp->wb_this_page; >>> + } while (tmp != req); >>> + >>> + return true; >>> +} >>> + >>> +/* >>> + * nfs_page_group_sync_on_bit - set bit on current request, but only >>> + * return true if the bit is set for all requests in page group >>> + * @req - request in page group >>> + * @bit - PG_* bit that is used to sync page group >>> + */ >>> +bool nfs_page_group_sync_on_bit(struct nfs_page *req, unsigned int bit) >>> +{ >>> + bool ret; >>> + >>> + nfs_page_group_lock(req); >>> + ret = nfs_page_group_sync_on_bit_locked(req, bit); >>> + nfs_page_group_unlock(req); >>> + >>> + return ret; >>> +} >>> + >>> +/* >>> + * nfs_page_group_init - Initialize the page group linkage for @req >>> + * @req - a new nfs request >>> + * @prev - the previous request in page group, or NULL if @req is the first >>> + * or only request in the group (the head). >>> + */ >>> +static inline void >>> +nfs_page_group_init(struct nfs_page *req, struct nfs_page *prev) >>> +{ >>> + WARN_ON_ONCE(prev == req); >>> + >>> + if (!prev) { >>> + req->wb_head = req; >>> + req->wb_this_page = req; >>> + } else { >>> + WARN_ON_ONCE(prev->wb_this_page != prev->wb_head); >>> + WARN_ON_ONCE(!test_bit(PG_HEADLOCK, &prev->wb_head->wb_flags)); >>> + req->wb_head = prev->wb_head; >>> + req->wb_this_page = prev->wb_this_page; >>> + prev->wb_this_page = req; >>> + >>> + /* grab extra ref if head request has extra ref from >>> + * the write/commit path to handle handoff between write >>> + * and commit lists */ >>> + if (test_bit(PG_INODE_REF, &prev->wb_head->wb_flags)) >>> + kref_get(&req->wb_kref); >>> + } >>> +} >>> + >>> +/* >>> + * nfs_page_group_destroy - sync the destruction of page groups >>> + * @req - request that no longer needs the page group >>> + * >>> + * releases the page group reference from each member once all >>> + * members have called this function. >>> + */ >>> +static void >>> +nfs_page_group_destroy(struct kref *kref) >>> +{ >>> + struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref); >>> + struct nfs_page *tmp, *next; >>> + >>> + if (!nfs_page_group_sync_on_bit(req, PG_TEARDOWN)) >>> + return; >>> + >>> + tmp = req; >>> + do { >>> + next = tmp->wb_this_page; >>> + /* unlink and free */ >>> + tmp->wb_this_page = tmp; >>> + tmp->wb_head = tmp; >>> + nfs_free_request(tmp); >>> + tmp = next; >>> + } while (tmp != req); >>> +} >>> + >>> /** >>> * nfs_create_request - Create an NFS read/write request. >>> * @ctx: open context to use >>> * @page: page to write >>> + * @last: last nfs request created for this page group or NULL if head >>> * @offset: starting offset within the page for the write >>> * @count: number of bytes to read/write >>> * >>> @@ -149,7 +292,8 @@ nfs_iocounter_wait(struct nfs_io_counter *c) >>> */ >>> struct nfs_page * >>> nfs_create_request(struct nfs_open_context *ctx, struct page *page, >>> - unsigned int offset, unsigned int count) >>> + struct nfs_page *last, unsigned int offset, >>> + unsigned int count) >>> { >>> struct nfs_page *req; >>> struct nfs_lock_context *l_ctx; >>> @@ -181,6 +325,7 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page, >>> req->wb_bytes = count; >>> req->wb_context = get_nfs_open_context(ctx); >>> kref_init(&req->wb_kref); >>> + nfs_page_group_init(req, last); >>> return req; >>> } >>> >>> @@ -238,16 +383,18 @@ static void nfs_clear_request(struct nfs_page *req) >>> } >>> } >>> >>> - >>> /** >>> * nfs_release_request - Release the count on an NFS read/write request >>> * @req: request to release >>> * >>> * Note: Should never be called with the spinlock held! >>> */ >>> -static void nfs_free_request(struct kref *kref) >>> +static void nfs_free_request(struct nfs_page *req) >>> { >>> - struct nfs_page *req = container_of(kref, struct nfs_page, wb_kref); >>> + WARN_ON_ONCE(req->wb_this_page != req); >>> + >>> + /* extra debug: make sure no sync bits are still set */ >>> + WARN_ON_ONCE(test_bit(PG_TEARDOWN, &req->wb_flags)); >>> >>> /* Release struct file and open context */ >>> nfs_clear_request(req); >>> @@ -256,7 +403,7 @@ static void nfs_free_request(struct kref *kref) >>> >>> void nfs_release_request(struct nfs_page *req) >>> { >>> - kref_put(&req->wb_kref, nfs_free_request); >>> + kref_put(&req->wb_kref, nfs_page_group_destroy); >>> } >>> >>> static int nfs_wait_bit_uninterruptible(void *word) >>> @@ -833,21 +980,66 @@ static void nfs_pageio_doio(struct nfs_pageio_descriptor *desc) >>> * @desc: destination io descriptor >>> * @req: request >>> * >>> + * This may split a request into subrequests which are all part of the >>> + * same page group. >>> + * >>> * Returns true if the request 'req' was successfully coalesced into the >>> * existing list of pages 'desc'. >>> */ >>> static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc, >>> struct nfs_page *req) >>> { >>> - while (!nfs_pageio_do_add_request(desc, req)) { >>> - desc->pg_moreio = 1; >>> - nfs_pageio_doio(desc); >>> - if (desc->pg_error < 0) >>> - return 0; >>> - desc->pg_moreio = 0; >>> - if (desc->pg_recoalesce) >>> - return 0; >>> - } >>> + struct nfs_page *subreq; >>> + unsigned int bytes_left = 0; >>> + unsigned int offset, pgbase; >>> + >>> + nfs_page_group_lock(req); >>> + >>> + subreq = req; >>> + bytes_left = subreq->wb_bytes; >>> + offset = subreq->wb_offset; >>> + pgbase = subreq->wb_pgbase; >>> + >>> + do { >>> + if (!nfs_pageio_do_add_request(desc, subreq)) { >>> + /* make sure pg_test call(s) did nothing */ >>> + WARN_ON_ONCE(subreq->wb_bytes != bytes_left); >>> + WARN_ON_ONCE(subreq->wb_offset != offset); >>> + WARN_ON_ONCE(subreq->wb_pgbase != pgbase); >>> + >>> + nfs_page_group_unlock(req); >>> + desc->pg_moreio = 1; >>> + nfs_pageio_doio(desc); >>> + if (desc->pg_error < 0) >>> + return 0; >>> + desc->pg_moreio = 0; >>> + if (desc->pg_recoalesce) >>> + return 0; >> Would it make sense to 'or' together the desc->pg_error and desc->pg_recoalesce checks, rather than having two distinct if statements with the same return value? >> >> Anna >> > That�s a copy-paste from the original __nfs_pageio_add_request. I didn�t combine the > statements, because desc->pg_moreio is set to 0 iff desc->pg_error < 0 evaluates to false. > > We might be able to clean this up, but it�s not just as simple as combining them into one > statement. Fair enough! I must have missed the copy-and-paste part. Anna > > -dros > >>> + /* retry add_request for this subreq */ >>> + nfs_page_group_lock(req); >>> + continue; >>> + } >>> + >>> + /* check for buggy pg_test call(s) */ >>> + WARN_ON_ONCE(subreq->wb_bytes + subreq->wb_pgbase > PAGE_SIZE); >>> + WARN_ON_ONCE(subreq->wb_bytes > bytes_left); >>> + WARN_ON_ONCE(subreq->wb_bytes == 0); >>> + >>> + bytes_left -= subreq->wb_bytes; >>> + offset += subreq->wb_bytes; >>> + pgbase += subreq->wb_bytes; >>> + >>> + if (bytes_left) { >>> + subreq = nfs_create_request(req->wb_context, >>> + req->wb_page, >>> + subreq, pgbase, bytes_left); >>> + nfs_lock_request(subreq); >>> + subreq->wb_offset = offset; >>> + subreq->wb_index = req->wb_index; >>> + } >>> + } while (bytes_left > 0); >>> + >>> + nfs_page_group_unlock(req); >>> return 1; >>> } >>> >>> diff --git a/fs/nfs/read.c b/fs/nfs/read.c >>> index 46d9044..902ba2c 100644 >>> --- a/fs/nfs/read.c >>> +++ b/fs/nfs/read.c >>> @@ -85,7 +85,7 @@ int nfs_readpage_async(struct nfs_open_context *ctx, struct inode *inode, >>> len = nfs_page_length(page); >>> if (len == 0) >>> return nfs_return_empty_page(page); >>> - new = nfs_create_request(ctx, page, 0, len); >>> + new = nfs_create_request(ctx, page, NULL, 0, len); >>> if (IS_ERR(new)) { >>> unlock_page(page); >>> return PTR_ERR(new); >>> @@ -311,7 +311,7 @@ readpage_async_filler(void *data, struct page *page) >>> if (len == 0) >>> return nfs_return_empty_page(page); >>> >>> - new = nfs_create_request(desc->ctx, page, 0, len); >>> + new = nfs_create_request(desc->ctx, page, NULL, 0, len); >>> if (IS_ERR(new)) >>> goto out_error; >>> >>> diff --git a/fs/nfs/write.c b/fs/nfs/write.c >>> index e773df2..d0f30f1 100644 >>> --- a/fs/nfs/write.c >>> +++ b/fs/nfs/write.c >>> @@ -367,6 +367,8 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) >>> { >>> struct nfs_inode *nfsi = NFS_I(inode); >>> >>> + WARN_ON_ONCE(req->wb_this_page != req); >>> + >>> /* Lock the request! */ >>> nfs_lock_request(req); >>> >>> @@ -383,6 +385,7 @@ static void nfs_inode_add_request(struct inode *inode, struct nfs_page *req) >>> set_page_private(req->wb_page, (unsigned long)req); >>> } >>> nfsi->npages++; >>> + set_bit(PG_INODE_REF, &req->wb_flags); >>> kref_get(&req->wb_kref); >>> spin_unlock(&inode->i_lock); >>> } >>> @@ -567,6 +570,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr) >>> { >>> struct nfs_commit_info cinfo; >>> unsigned long bytes = 0; >>> + bool do_destroy; >>> >>> if (test_bit(NFS_IOHDR_REDO, &hdr->flags)) >>> goto out; >>> @@ -596,6 +600,7 @@ remove_req: >>> next: >>> nfs_unlock_request(req); >>> nfs_end_page_writeback(req->wb_page); >>> + do_destroy = !test_bit(NFS_IOHDR_NEED_COMMIT, &hdr->flags); >>> nfs_release_request(req); >>> } >>> out: >>> @@ -700,6 +705,10 @@ static struct nfs_page *nfs_try_to_update_request(struct inode *inode, >>> if (req == NULL) >>> goto out_unlock; >>> >>> + /* should be handled by nfs_flush_incompatible */ >>> + WARN_ON_ONCE(req->wb_head != req); >>> + WARN_ON_ONCE(req->wb_this_page != req); >>> + >>> rqend = req->wb_offset + req->wb_bytes; >>> /* >>> * Tell the caller to flush out the request if >>> @@ -761,7 +770,7 @@ static struct nfs_page * nfs_setup_write_request(struct nfs_open_context* ctx, >>> req = nfs_try_to_update_request(inode, page, offset, bytes); >>> if (req != NULL) >>> goto out; >>> - req = nfs_create_request(ctx, page, offset, bytes); >>> + req = nfs_create_request(ctx, page, NULL, offset, bytes); >>> if (IS_ERR(req)) >>> goto out; >>> nfs_inode_add_request(inode, req); >>> @@ -805,6 +814,8 @@ int nfs_flush_incompatible(struct file *file, struct page *page) >>> return 0; >>> l_ctx = req->wb_lock_context; >>> do_flush = req->wb_page != page || req->wb_context != ctx; >>> + /* for now, flush if more than 1 request in page_group */ >>> + do_flush |= req->wb_this_page != req; >>> if (l_ctx && ctx->dentry->d_inode->i_flock != NULL) { >>> do_flush |= l_ctx->lockowner.l_owner != current->files >>> || l_ctx->lockowner.l_pid != current->tgid; >>> diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h >>> index 13d59af..986c0c2 100644 >>> --- a/include/linux/nfs_page.h >>> +++ b/include/linux/nfs_page.h >>> @@ -26,6 +26,9 @@ enum { >>> PG_MAPPED, /* page private set for buffered io */ >>> PG_CLEAN, /* write succeeded */ >>> PG_COMMIT_TO_DS, /* used by pnfs layouts */ >>> + PG_INODE_REF, /* extra ref held by inode (head req only) */ >>> + PG_HEADLOCK, /* page group lock of wb_head */ >>> + PG_TEARDOWN, /* page group sync for destroy */ >>> }; >>> >>> struct nfs_inode; >>> @@ -41,6 +44,8 @@ struct nfs_page { >>> struct kref wb_kref; /* reference count */ >>> unsigned long wb_flags; >>> struct nfs_write_verifier wb_verf; /* Commit cookie */ >>> + struct nfs_page *wb_this_page; /* list of reqs for this page */ >>> + struct nfs_page *wb_head; /* head pointer for req list */ >>> }; >>> >>> struct nfs_pageio_descriptor; >>> @@ -87,9 +92,10 @@ struct nfs_pageio_descriptor { >>> >>> extern struct nfs_page *nfs_create_request(struct nfs_open_context *ctx, >>> struct page *page, >>> + struct nfs_page *last, >>> unsigned int offset, >>> unsigned int count); >>> -extern void nfs_release_request(struct nfs_page *req); >>> +extern void nfs_release_request(struct nfs_page *); >>> >>> >>> extern void nfs_pageio_init(struct nfs_pageio_descriptor *desc, >>> @@ -108,7 +114,10 @@ extern size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, >>> struct nfs_page *req); >>> extern int nfs_wait_on_request(struct nfs_page *); >>> extern void nfs_unlock_request(struct nfs_page *req); >>> -extern void nfs_unlock_and_release_request(struct nfs_page *req); >>> +extern void nfs_unlock_and_release_request(struct nfs_page *); >>> +extern void nfs_page_group_lock(struct nfs_page *); >>> +extern void nfs_page_group_unlock(struct nfs_page *); >>> +extern bool nfs_page_group_sync_on_bit(struct nfs_page *, unsigned int); >>> >>> /* >>> * Lock the page of an asynchronous request -- To unsubscribe from this list: send the line "unsubscribe linux-nfs" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html