On 2011-03-23 22:30, Fred Isaman wrote: > 2011/3/23 Benny Halevy <bhalevy@xxxxxxxxxxx>: >> On 2011-03-23 15:27, Fred Isaman wrote: >>> >>> We create three major hooks for the pnfs code. >>> >>> pnfs_mark_request_commit() is called during writeback_done from >>> nfs_mark_request_commit, which gives the driver an opportunity to >>> claim it wants control over commiting a particular req. >>> >>> pnfs_choose_commit_list() is called from nfs_scan_list >>> to choose which list a given req should be added to, based on >>> where we intend to send it for COMMIT. It is up to the driver >>> to have preallocated list headers for each destination it may need. >>> >>> pnfs_commit_list() is how the driver actually takes control, it is >>> used instead of nfs_commit_list(). >>> >>> In order to pass information between the above functions, we create >>> a union in nfs_page to hold a lseg (which is possible because the req is >>> not on any list while in transition), and add some flags to indicate >>> if we need to use the pnfs code. >>> >>> Signed-off-by: Fred Isaman<iisaman@xxxxxxxxxx> >>> --- >>> fs/nfs/pagelist.c | 5 ++- >>> fs/nfs/pnfs.h | 73 >>> ++++++++++++++++++++++++++++++++++++++++++++++ >>> fs/nfs/write.c | 41 ++++++++++++++++--------- >>> include/linux/nfs_fs.h | 1 + >>> include/linux/nfs_page.h | 6 +++- >>> 5 files changed, 108 insertions(+), 18 deletions(-) >>> >>> diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c >>> index fd85618..87a593c 100644 >>> --- a/fs/nfs/pagelist.c >>> +++ b/fs/nfs/pagelist.c >>> @@ -398,6 +398,7 @@ int nfs_scan_list(struct nfs_inode *nfsi, >>> pgoff_t idx_end; >>> int found, i; >>> int res; >>> + struct list_head *list; >>> >>> res = 0; >>> if (npages == 0) >>> @@ -418,10 +419,10 @@ int nfs_scan_list(struct nfs_inode *nfsi, >>> idx_start = req->wb_index + 1; >>> if (nfs_set_page_tag_locked(req)) { >>> kref_get(&req->wb_kref); >>> - nfs_list_remove_request(req); >>> radix_tree_tag_clear(&nfsi->nfs_page_tree, >>> req->wb_index, tag); >>> - nfs_list_add_request(req, dst); >>> + list = pnfs_choose_commit_list(req, dst); >>> + nfs_list_add_request(req, list); >>> res++; >>> if (res == INT_MAX) >>> goto out; >>> diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h >>> index 6380b94..5370f1b 100644 >>> --- a/fs/nfs/pnfs.h >>> +++ b/fs/nfs/pnfs.h >>> @@ -74,6 +74,13 @@ struct pnfs_layoutdriver_type { >>> /* test for nfs page cache coalescing */ >>> int (*pg_test)(struct nfs_pageio_descriptor *, struct nfs_page *, >>> struct nfs_page *); >>> >>> + /* Returns true if layoutdriver wants to divert this request to >>> + * driver's commit routine. >>> + */ >>> + bool (*mark_pnfs_commit)(struct pnfs_layout_segment *lseg); >>> + struct list_head * (*choose_commit_list) (struct nfs_page *req); >>> + int (*commit_pagelist)(struct inode *inode, struct list_head >>> *mds_pages, int how); >>> + >>> /* >>> * Return PNFS_ATTEMPTED to indicate the layout code has attempted >>> * I/O, else return PNFS_NOT_ATTEMPTED to fall back to normal NFS >>> @@ -169,6 +176,51 @@ static inline int pnfs_enabled_sb(struct nfs_server >>> *nfss) >>> return nfss->pnfs_curr_ld != NULL; >>> } >>> >>> +static inline void >>> +pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment >>> *lseg) >>> +{ >>> + if (lseg) { >>> + struct pnfs_layoutdriver_type *ld; >>> + >>> + ld = >>> NFS_SERVER(req->wb_page->mapping->host)->pnfs_curr_ld; >>> + if (ld->mark_pnfs_commit&& ld->mark_pnfs_commit(lseg)) { >> >> nit: space before and after "&&" >> > > I'm ignoring the formatting nits due to mailer mishandling, per your > subsequent email. > >>> + set_bit(PG_PNFS_COMMIT,&req->wb_flags); >> >> nit: space after comma >> >>> + req->wb_commit_lseg = get_lseg(lseg); >> >> What are the atomicity requirements of the PG_PNFS_COMMIT bit in wb_flags >> and the validity of wb_commit_lseg? >> > > It is handled under the PG_BUSY bit_lock. > OK >>> + } >>> + } >>> +} >>> + >>> +static inline int >>> +pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int >>> how) >>> +{ >>> + if (!test_and_clear_bit(NFS_INO_PNFS_COMMIT,&NFS_I(inode)->flags)) >>> + return PNFS_NOT_ATTEMPTED; >>> + return NFS_SERVER(inode)->pnfs_curr_ld->commit_pagelist(inode, >>> mds_pages, how); >> >> Again, I don't get the concurrency control here... >> NFS_INO_PNFS_COMMIT is set in pnfs_choose_commit_list under i_lock >> but then pnfs_commit_list is called outside the i_lock so what guarantees >> that NFS_INO_PNFS_COMMIT is still valid with respect to the output of >> pnfs_choose_commit_list? >> > > > They are both done under the NFS_INO_COMMIT bit lock. > >>> +} >>> + >>> +static inline struct list_head * >>> +pnfs_choose_commit_list(struct nfs_page *req, struct list_head *mds) >>> +{ >>> + struct list_head *rv; >>> + >>> + if (test_and_clear_bit(PG_PNFS_COMMIT,&req->wb_flags)) { >> >> nit: space after comma >> >>> + struct inode *inode = >>> req->wb_commit_lseg->pls_layout->plh_inode; >>> + >>> + set_bit(NFS_INO_PNFS_COMMIT,&NFS_I(inode)->flags); >> >> nit: ditto >> >>> + rv = >>> NFS_SERVER(inode)->pnfs_curr_ld->choose_commit_list(req); >>> + /* matched by ref taken when PG_PNFS_COMMIT is set */ >>> + put_lseg(req->wb_commit_lseg); >> >> Since wb_commit_lseg and wb_list are in a union, how about >> INIT_LIST_HEAD(&req->wb_list); > > I actually had that there. Trond pointed out it was unnecessary and > asked that it be removed. > Seems fragile to me. I hope Trond's around to fix it when it breaks ;-) >> >>> + } else >>> + rv = mds; >>> + return rv; >>> +} >>> + >>> +static inline void pnfs_clear_request_commit(struct nfs_page *req) >>> +{ >>> + if (test_and_clear_bit(PG_PNFS_COMMIT,&req->wb_flags)) >>> + put_lseg(req->wb_commit_lseg); >> >> ditto > > I see your ditto, and raise you one :) > :) >> >> Benny >> >>> +} >>> + >>> #else /* CONFIG_NFS_V4_1 */ >>> >>> static inline void pnfs_destroy_all_layouts(struct nfs_client *clp) >>> @@ -252,6 +304,27 @@ pnfs_pageio_init_write(struct nfs_pageio_descriptor >>> *pgio, struct inode *ino) >>> pgio->pg_test = NULL; >>> } >>> >>> +static inline void >>> +pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment >>> *lseg) >>> +{ >>> +} >>> + >>> +static inline int >>> +pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int >>> how) >>> +{ >>> + return PNFS_NOT_ATTEMPTED; >>> +} >>> + >>> +static inline struct list_head * >>> +pnfs_choose_commit_list(struct nfs_page *req, struct list_head *mds) >>> +{ >>> + return mds; >>> +} >>> + >>> +static inline void pnfs_clear_request_commit(struct nfs_page *req) >>> +{ >>> +} >>> + >>> #endif /* CONFIG_NFS_V4_1 */ >>> >>> #endif /* FS_NFS_PNFS_H */ >>> diff --git a/fs/nfs/write.c b/fs/nfs/write.c >>> index f5f005e..6927a18 100644 >>> --- a/fs/nfs/write.c >>> +++ b/fs/nfs/write.c >>> @@ -441,7 +441,7 @@ nfs_mark_request_dirty(struct nfs_page *req) >>> * Add a request to the inode's commit list. >>> */ >>> static void >>> -nfs_mark_request_commit(struct nfs_page *req) >>> +nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment >>> *lseg) >>> { >>> struct inode *inode = req->wb_context->path.dentry->d_inode; >>> struct nfs_inode *nfsi = NFS_I(inode); >>> @@ -453,6 +453,7 @@ nfs_mark_request_commit(struct nfs_page *req) >>> NFS_PAGE_TAG_COMMIT); >>> nfsi->ncommit++; >>> spin_unlock(&inode->i_lock); >>> + pnfs_mark_request_commit(req, lseg); >> >> Why do this out of the i_lock? >> > > Both the req and lseg are held in memory by references, and any > serialization needs are met by the req's PG_BUSY bit lock. > > Fred > >>> inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); >>> inc_bdi_stat(req->wb_page->mapping->backing_dev_info, >>> BDI_RECLAIMABLE); >>> __mark_inode_dirty(inode, I_DIRTY_DATASYNC); >>> @@ -481,10 +482,11 @@ int nfs_write_need_commit(struct nfs_write_data >>> *data) >>> } >>> >>> static inline >>> -int nfs_reschedule_unstable_write(struct nfs_page *req) >>> +int nfs_reschedule_unstable_write(struct nfs_page *req, >>> + struct nfs_write_data *data) >>> { >>> if (test_and_clear_bit(PG_NEED_COMMIT,&req->wb_flags)) { >>> - nfs_mark_request_commit(req); >>> + nfs_mark_request_commit(req, data->lseg); >>> return 1; >>> } >>> if (test_and_clear_bit(PG_NEED_RESCHED,&req->wb_flags)) { >>> @@ -495,7 +497,7 @@ int nfs_reschedule_unstable_write(struct nfs_page >>> *req) >>> } >>> #else >>> static inline void >>> -nfs_mark_request_commit(struct nfs_page *req) >>> +nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment >>> *lseg) >>> { >>> } >>> >>> @@ -512,7 +514,8 @@ int nfs_write_need_commit(struct nfs_write_data *data) >>> } >>> >>> static inline >>> -int nfs_reschedule_unstable_write(struct nfs_page *req) >>> +int nfs_reschedule_unstable_write(struct nfs_page *req, >>> + struct nfs_write_data *data) >>> { >>> return 0; >>> } >>> @@ -615,9 +618,11 @@ static struct nfs_page >>> *nfs_try_to_update_request(struct inode *inode, >>> } >>> >>> if (nfs_clear_request_commit(req)&& >>> - radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree, >>> - req->wb_index, NFS_PAGE_TAG_COMMIT) != >>> NULL) >>> + radix_tree_tag_clear(&NFS_I(inode)->nfs_page_tree, >>> + req->wb_index, NFS_PAGE_TAG_COMMIT) != >>> NULL) { >>> NFS_I(inode)->ncommit--; >>> + pnfs_clear_request_commit(req); >>> + } >>> >>> /* Okay, the request matches. Update the region */ >>> if (offset< req->wb_offset) { >>> @@ -765,11 +770,12 @@ int nfs_updatepage(struct file *file, struct page >>> *page, >>> return status; >>> } >>> >>> -static void nfs_writepage_release(struct nfs_page *req) >>> +static void nfs_writepage_release(struct nfs_page *req, >>> + struct nfs_write_data *data) >>> { >>> struct page *page = req->wb_page; >>> >>> - if (PageError(req->wb_page) || >>> !nfs_reschedule_unstable_write(req)) >>> + if (PageError(req->wb_page) || !nfs_reschedule_unstable_write(req, >>> data)) >>> nfs_inode_remove_request(req); >>> nfs_clear_page_tag_locked(req); >>> nfs_end_page_writeback(page); >>> @@ -1087,7 +1093,7 @@ static void nfs_writeback_release_partial(void >>> *calldata) >>> >>> out: >>> if (atomic_dec_and_test(&req->wb_complete)) >>> - nfs_writepage_release(req); >>> + nfs_writepage_release(req, data); >>> nfs_writedata_release(calldata); >>> } >>> >>> @@ -1154,7 +1160,7 @@ static void nfs_writeback_release_full(void >>> *calldata) >>> >>> if (nfs_write_need_commit(data)) { >>> memcpy(&req->wb_verf,&data->verf, >>> sizeof(req->wb_verf)); >>> - nfs_mark_request_commit(req); >>> + nfs_mark_request_commit(req, data->lseg); >>> dprintk(" marked for commit\n"); >>> goto next; >>> } >>> @@ -1357,14 +1363,15 @@ static void nfs_init_commit(struct nfs_write_data >>> *data, >>> nfs_fattr_init(&data->fattr); >>> } >>> >>> -static void nfs_retry_commit(struct list_head *page_list) >>> +static void nfs_retry_commit(struct list_head *page_list, >>> + struct pnfs_layout_segment *lseg) >>> { >>> struct nfs_page *req; >>> >>> while (!list_empty(page_list)) { >>> req = nfs_list_entry(page_list->next); >>> nfs_list_remove_request(req); >>> - nfs_mark_request_commit(req); >>> + nfs_mark_request_commit(req, lseg); >>> dec_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); >>> dec_bdi_stat(req->wb_page->mapping->backing_dev_info, >>> BDI_RECLAIMABLE); >>> @@ -1389,7 +1396,7 @@ nfs_commit_list(struct inode *inode, struct >>> list_head *head, int how) >>> nfs_init_commit(data, head); >>> return nfs_initiate_commit(data, NFS_CLIENT(inode), data->mds_ops, >>> how); >>> out_bad: >>> - nfs_retry_commit(head); >>> + nfs_retry_commit(head, NULL); >>> nfs_commit_clear_lock(NFS_I(inode)); >>> return -ENOMEM; >>> } >>> @@ -1477,7 +1484,11 @@ int nfs_commit_inode(struct inode *inode, int how) >>> res = nfs_scan_commit(inode,&head, 0, 0); >>> spin_unlock(&inode->i_lock); >>> if (res) { >>> - int error = nfs_commit_list(inode,&head, how); >>> + int error; >>> + >>> + error = pnfs_commit_list(inode,&head, how); >>> + if (error == PNFS_NOT_ATTEMPTED) >>> + error = nfs_commit_list(inode,&head, how); >>> if (error< 0) >>> return error; >>> if (!may_wait) >>> diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h >>> index cb2add4..f64ea14 100644 >>> --- a/include/linux/nfs_fs.h >>> +++ b/include/linux/nfs_fs.h >>> @@ -221,6 +221,7 @@ struct nfs_inode { >>> #define NFS_INO_FSCACHE (5) /* inode can be >>> cached by FS-Cache */ >>> #define NFS_INO_FSCACHE_LOCK (6) /* FS-Cache cookie >>> management lock */ >>> #define NFS_INO_COMMIT (7) /* inode is >>> committing unstable writes */ >>> +#define NFS_INO_PNFS_COMMIT (8) /* use pnfs code for >>> commit */ >> >> nit: alignment >> >>> >>> static inline struct nfs_inode *NFS_I(const struct inode *inode) >>> { >>> diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h >>> index 92d54c8..8023e4e 100644 >>> --- a/include/linux/nfs_page.h >>> +++ b/include/linux/nfs_page.h >>> @@ -33,11 +33,15 @@ enum { >>> PG_CLEAN, >>> PG_NEED_COMMIT, >>> PG_NEED_RESCHED, >>> + PG_PNFS_COMMIT, >>> }; >>> >>> struct nfs_inode; >>> struct nfs_page { >>> - struct list_head wb_list; /* Defines state of page: >>> */ >>> + union { >>> + struct list_head wb_list; /* Defines state >>> of page: */ >>> + struct pnfs_layout_segment *wb_commit_lseg; /* Used when >>> PG_PNFS_COMMIT set */ >>> + }; >>> struct page *wb_page; /* page to read in/write >>> out */ >>> struct nfs_open_context *wb_context; /* File state context info >>> */ >>> struct nfs_lock_context *wb_lock_context; /* lock context >>> info */ >> >> -- >> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in >> the body of a message to majordomo@xxxxxxxxxxxxxxx >> More majordomo info at http://vger.kernel.org/majordomo-info.html >> -- To unsubscribe from this list: send the line "unsubscribe linux-nfs" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html