On Tue, Jan 06, 2015 at 02:59:57PM -0500, Anna Schumaker wrote: > Hey Tom, > > On 12/24/2014 02:13 AM, Tom Haynes wrote: > > The flexfile layout is a new layout that extends the > > file layout. It is currently being drafted as a specification at > > https://datatracker.ietf.org/doc/draft-ietf-nfsv4-layout-types/ > > > > Signed-off-by: Weston Andros Adamson <dros@xxxxxxxxxxxxxxx> > > Signed-off-by: Tom Haynes <loghyr@xxxxxxxxxxxxxxx> > > Signed-off-by: Tao Peng <bergwolf@xxxxxxxxxxxxxxx> > > --- > > fs/nfs/Kconfig | 5 + > > fs/nfs/Makefile | 1 + > > fs/nfs/flexfilelayout/Makefile | 5 + > > fs/nfs/flexfilelayout/flexfilelayout.c | 1600 +++++++++++++++++++++++++++++ > > fs/nfs/flexfilelayout/flexfilelayout.h | 158 +++ > > fs/nfs/flexfilelayout/flexfilelayoutdev.c | 552 ++++++++++ > > include/linux/nfs4.h | 1 + > > 7 files changed, 2322 insertions(+) > > create mode 100644 fs/nfs/flexfilelayout/Makefile > > create mode 100644 fs/nfs/flexfilelayout/flexfilelayout.c > > create mode 100644 fs/nfs/flexfilelayout/flexfilelayout.h > > create mode 100644 fs/nfs/flexfilelayout/flexfilelayoutdev.c > > > > diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig > > index 3dece03..c7abc10 100644 > > --- a/fs/nfs/Kconfig > > +++ b/fs/nfs/Kconfig > > @@ -128,6 +128,11 @@ config PNFS_OBJLAYOUT > > depends on NFS_V4_1 && SCSI_OSD_ULD > > default NFS_V4 > > > > +config PNFS_FLEXFILE_LAYOUT > > + tristate > > + depends on NFS_V4_1 && NFS_V3 > > + default m > > + > > config NFS_V4_1_IMPLEMENTATION_ID_DOMAIN > > string "NFSv4.1 Implementation ID Domain" > > depends on NFS_V4_1 > > diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile > > index 7973c4e3..3c97bd9 100644 > > --- a/fs/nfs/Makefile > > +++ b/fs/nfs/Makefile > > @@ -33,3 +33,4 @@ nfsv4-$(CONFIG_NFS_V4_2) += nfs42proc.o > > obj-$(CONFIG_PNFS_FILE_LAYOUT) += filelayout/ > > obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/ > > obj-$(CONFIG_PNFS_BLOCK) += blocklayout/ > > +obj-$(CONFIG_PNFS_FLEXFILE_LAYOUT) += flexfilelayout/ > > diff --git a/fs/nfs/flexfilelayout/Makefile b/fs/nfs/flexfilelayout/Makefile > > new file mode 100644 > > index 0000000..1d2c9f6 > > --- /dev/null > > +++ b/fs/nfs/flexfilelayout/Makefile > > @@ -0,0 +1,5 @@ > > +# > > +# Makefile for the pNFS Flexfile Layout Driver kernel module > > +# > > +obj-$(CONFIG_PNFS_FLEXFILE_LAYOUT) += nfs_layout_flexfiles.o > > +nfs_layout_flexfiles-y := flexfilelayout.o flexfilelayoutdev.o > > diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c > > new file mode 100644 > > index 0000000..fddd3e6 > > --- /dev/null > > +++ b/fs/nfs/flexfilelayout/flexfilelayout.c > > @@ -0,0 +1,1600 @@ > > +/* > > + * Module for pnfs flexfile layout driver. > > + * > > + * Copyright (c) 2014, Primary Data, Inc. All rights reserved. > > + * > > + * Tao Peng <bergwolf@xxxxxxxxxxxxxxx> > > + */ > > + > > +#include <linux/nfs_fs.h> > > +#include <linux/nfs_page.h> > > +#include <linux/module.h> > > + > > +#include <linux/sunrpc/metrics.h> > > + > > +#include "flexfilelayout.h" > > +#include "../nfs4session.h" > > +#include "../internal.h" > > +#include "../delegation.h" > > +#include "../nfs4trace.h" > > +#include "../iostat.h" > > +#include "../nfs.h" > > + > > +#define NFSDBG_FACILITY NFSDBG_PNFS_LD > > + > > +#define FF_LAYOUT_POLL_RETRY_MAX (15*HZ) > > + > > +static struct pnfs_layout_hdr * > > +ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) > > +{ > > + struct nfs4_flexfile_layout *ffl; > > + > > + ffl = kzalloc(sizeof(*ffl), gfp_flags); > > + INIT_LIST_HEAD(&ffl->error_list); > > + return ffl != NULL ? &ffl->generic_hdr : NULL; > > +} > > + > > +static void > > +ff_layout_free_layout_hdr(struct pnfs_layout_hdr *lo) > > +{ > > + struct nfs4_ff_layout_ds_err *err, *n; > > + > > + list_for_each_entry_safe(err, n, &FF_LAYOUT_FROM_HDR(lo)->error_list, > > + list) { > > + list_del(&err->list); > > + kfree(err); > > + } > > + kfree(FF_LAYOUT_FROM_HDR(lo)); > > +} > > + > > +static int decode_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid) > > +{ > > + __be32 *p; > > + > > + p = xdr_inline_decode(xdr, NFS4_STATEID_SIZE); > > + if (unlikely(p == NULL)) > > + return -ENOBUFS; > > + memcpy(stateid, p, NFS4_STATEID_SIZE); > > + dprintk("%s: stateid id= [%x%x%x%x]\n", __func__, > > + p[0], p[1], p[2], p[3]); > > + return 0; > > +} > > + > > +static int decode_deviceid(struct xdr_stream *xdr, struct nfs4_deviceid *devid) > > +{ > > + __be32 *p; > > + > > + p = xdr_inline_decode(xdr, NFS4_DEVICEID4_SIZE); > > + if (unlikely(!p)) > > + return -ENOBUFS; > > + memcpy(devid, p, NFS4_DEVICEID4_SIZE); > > + nfs4_print_deviceid(devid); > > + return 0; > > +} > > + > > +static int decode_nfs_fh(struct xdr_stream *xdr, struct nfs_fh *fh) > > +{ > > + __be32 *p; > > + > > + p = xdr_inline_decode(xdr, 4); > > + if (unlikely(!p)) > > + return -ENOBUFS; > > + fh->size = be32_to_cpup(p++); > > + if (fh->size > sizeof(struct nfs_fh)) { > > + printk(KERN_ERR "NFS flexfiles: Too big fh received %d\n", > > + fh->size); > > + return -EOVERFLOW; > > + } > > + /* fh.data */ > > + p = xdr_inline_decode(xdr, fh->size); > > + if (unlikely(!p)) > > + return -ENOBUFS; > > + memcpy(&fh->data, p, fh->size); > > + dprintk("%s: fh len %d\n", __func__, fh->size); > > + > > + return 0; > > +} > > + > > +/* > > + * we only handle AUTH_NONE and AUTH_UNIX for now. > > + * > > + * For AUTH_UNIX, we want to parse > > + * struct authsys_parms { > > + * unsigned int stamp; > > + * string machinename<255>; > > + * unsigned int uid; > > + * unsigned int gid; > > + * unsigned int gids<16>; > > + * }; > > + */ > > +static int > > +ff_layout_parse_auth(struct xdr_stream *xdr, > > + struct nfs4_ff_layout_mirror *mirror) > > +{ > > + __be32 *p; > > + int flavor, len, gid_it = 0; > > + > > + /* authflavor(4) + opaque_length(4)*/ > > + p = xdr_inline_decode(xdr, 8); > > + if (unlikely(!p)) > > + return -ENOBUFS; > > + flavor = be32_to_cpup(p++); > > + len = be32_to_cpup(p++); > > + if (flavor < RPC_AUTH_NULL || flavor >= RPC_AUTH_MAXFLAVOR || > > + len < 0) > > + return -EINVAL; > > + > > + dprintk("%s: flavor %u len %u\n", __func__, flavor, len); > > + > > + if (flavor == RPC_AUTH_NULL && len == 0) > > + goto out_fill; > > + > > + /* opaque body */ > > + p = xdr_inline_decode(xdr, len); > > + if (unlikely(!p)) > > + return -ENOBUFS; > > + > > + if (flavor == RPC_AUTH_NULL) { > > + mirror->uid = -1; > > + mirror->gid = -1; > > + } else if (flavor == RPC_AUTH_UNIX) { > > + int len2; > > + > > + p++; /* stamp */ > > + len2 = be32_to_cpup(p++); /* machinename length */ > > + dprintk("%s: machinename length %u\n", __func__, len2); > > + if (len2 < 0 || len2 >= len || len2 > 255) > > + return -EINVAL; > > + p += XDR_QUADLEN(len2); /* machinename */ > > + > > + mirror->uid = be32_to_cpup(p++); > > + mirror->gid = be32_to_cpup(p++); > > + > > + len2 = be32_to_cpup(p++); /* gid array length */ > > + dprintk("%s: gid array length %u\n", __func__, len2); > > + if (len2 > 16) > > + return -EINVAL; > > + for (; gid_it < len2; gid_it++) > > + mirror->gids[gid_it] = be32_to_cpup(p++); > > + } else { > > + return -EPROTONOSUPPORT; > > + } > > + > > +out_fill: > > + /* filling the rest of gids */ > > + for (; gid_it < 16; gid_it++) > > + mirror->gids[gid_it] = -1; > > + > > + return 0; > > +} > > + > > +static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment *fls) > > +{ > > + int i; > > + > > + if (fls->mirror_array) { > > + for (i = 0; i < fls->mirror_array_cnt; i++) { > > + /* normally mirror_ds is freed in > > + * .free_deviceid_node but we still do it here > > + * for .alloc_lseg error path */ > > + if (fls->mirror_array[i]) { > > + kfree(fls->mirror_array[i]->fh_versions); > > + nfs4_ff_layout_put_deviceid(fls->mirror_array[i]->mirror_ds); > > + kfree(fls->mirror_array[i]); > > + } > > + } > > + kfree(fls->mirror_array); > > + fls->mirror_array = NULL; > > + } > > +} > > + > > +static int ff_layout_check_layout(struct nfs4_layoutget_res *lgr) > > +{ > > + int ret = 0; > > + > > + dprintk("--> %s\n", __func__); > > + > > + /* FIXME: remove this check when layout segment support is added */ > > + if (lgr->range.offset != 0 || > > + lgr->range.length != NFS4_MAX_UINT64) { > > + dprintk("%s Only whole file layouts supported. Use MDS i/o\n", > > + __func__); > > + ret = -EINVAL; > > + } > > + > > + dprintk("--> %s returns %d\n", __func__, ret); > > + return ret; > > +} > > + > > +static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls) > > +{ > > + if (fls) { > > + ff_layout_free_mirror_array(fls); > > + kfree(fls); > > + } > > +} > > + > > +static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls) > > +{ > > + struct nfs4_ff_layout_mirror *tmp; > > + int i, j; > > + > > + for (i = 0; i < fls->mirror_array_cnt - 1; i++) { > > + for (j = i + 1; j < fls->mirror_array_cnt; j++) > > + if (fls->mirror_array[i]->efficiency < > > + fls->mirror_array[j]->efficiency) { > > + tmp = fls->mirror_array[i]; > > + fls->mirror_array[i] = fls->mirror_array[j]; > > + fls->mirror_array[j] = tmp; > > + } > > + } > > +} > > + > > +static struct pnfs_layout_segment * > > +ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, > > + struct nfs4_layoutget_res *lgr, > > + gfp_t gfp_flags) > > +{ > > + struct pnfs_layout_segment *ret; > > + struct nfs4_ff_layout_segment *fls = NULL; > > + struct xdr_stream stream; > > + struct xdr_buf buf; > > + struct page *scratch; > > + u64 stripe_unit; > > + u32 mirror_array_cnt; > > + __be32 *p; > > + int i, rc; > > + > > + dprintk("--> %s\n", __func__); > > + scratch = alloc_page(gfp_flags); > > + if (!scratch) > > + return ERR_PTR(-ENOMEM); > > + > > + xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, > > + lgr->layoutp->len); > > + xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); > > + > > + /* stripe unit and mirror_array_cnt */ > > + rc = -EIO; > > + p = xdr_inline_decode(&stream, 8 + 4); > > + if (!p) > > + goto out_err_free; > > + > > + p = xdr_decode_hyper(p, &stripe_unit); > > + mirror_array_cnt = be32_to_cpup(p++); > > + dprintk("%s: stripe_unit=%llu mirror_array_cnt=%u\n", __func__, > > + stripe_unit, mirror_array_cnt); > > + > > + if (mirror_array_cnt > NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT || > > + mirror_array_cnt == 0) > > + goto out_err_free; > > + > > + rc = -ENOMEM; > > + fls = kzalloc(sizeof(*fls), gfp_flags); > > + if (!fls) > > + goto out_err_free; > > + > > + fls->mirror_array_cnt = mirror_array_cnt; > > + fls->stripe_unit = stripe_unit; > > + fls->mirror_array = kcalloc(fls->mirror_array_cnt, > > + sizeof(fls->mirror_array[0]), gfp_flags); > > + if (fls->mirror_array == NULL) > > + goto out_err_free; > > + > > + for (i = 0; i < fls->mirror_array_cnt; i++) { > > + struct nfs4_deviceid devid; > > + struct nfs4_deviceid_node *idnode; > > + u32 ds_count; > > + u32 fh_count; > > + int j; > > + > > + rc = -EIO; > > + p = xdr_inline_decode(&stream, 4); > > + if (!p) > > + goto out_err_free; > > + ds_count = be32_to_cpup(p); > > + > > + /* FIXME: allow for striping? */ > > + if (ds_count != 1) > > + goto out_err_free; > > + > > + fls->mirror_array[i] = > > + kzalloc(sizeof(struct nfs4_ff_layout_mirror), > > + gfp_flags); > > + if (fls->mirror_array[i] == NULL) { > > + rc = -ENOMEM; > > + goto out_err_free; > > + } > > + > > + spin_lock_init(&fls->mirror_array[i]->lock); > > + fls->mirror_array[i]->ds_count = ds_count; > > + > > + /* deviceid */ > > + rc = decode_deviceid(&stream, &devid); > > + if (rc) > > + goto out_err_free; > > + > > + idnode = nfs4_find_get_deviceid(NFS_SERVER(lh->plh_inode), > > + &devid, lh->plh_lc_cred, > > + gfp_flags); > > + /* > > + * upon success, mirror_ds is allocated by previous > > + * getdeviceinfo, or newly by .alloc_deviceid_node > > + * nfs4_find_get_deviceid failure is indeed getdeviceinfo falure > > + */ > > + if (idnode) > > + fls->mirror_array[i]->mirror_ds = > > + FF_LAYOUT_MIRROR_DS(idnode); > > + else > > + goto out_err_free; > > + > > + /* efficiency */ > > + rc = -EIO; > > + p = xdr_inline_decode(&stream, 4); > > + if (!p) > > + goto out_err_free; > > + fls->mirror_array[i]->efficiency = be32_to_cpup(p); > > + > > + /* stateid */ > > + rc = decode_stateid(&stream, &fls->mirror_array[i]->stateid); > > + if (rc) > > + goto out_err_free; > > + > > + /* fh */ > > + p = xdr_inline_decode(&stream, 4); > > + if (!p) > > + goto out_err_free; > > + fh_count = be32_to_cpup(p); > > + > > + fls->mirror_array[i]->fh_versions = > > + kzalloc(fh_count * sizeof(struct nfs_fh), > > + gfp_flags); > > + if (fls->mirror_array[i]->fh_versions == NULL) { > > + rc = -ENOMEM; > > + goto out_err_free; > > + } > > + > > + for (j = 0; j < fh_count; j++) { > > + rc = decode_nfs_fh(&stream, > > + &fls->mirror_array[i]->fh_versions[j]); > > + if (rc) > > + goto out_err_free; > > + } > > + > > + fls->mirror_array[i]->fh_versions_cnt = fh_count; > > + > > + /* opaque_auth */ > > + rc = ff_layout_parse_auth(&stream, fls->mirror_array[i]); > > + if (rc) > > + goto out_err_free; > > + > > + dprintk("%s: uid %d gid %d\n", __func__, > > + fls->mirror_array[i]->uid, > > + fls->mirror_array[i]->gid); > > + } > > + > > + ff_layout_sort_mirrors(fls); > > + rc = ff_layout_check_layout(lgr); > > + if (rc) > > + goto out_err_free; > > + > > + ret = &fls->generic_hdr; > > + dprintk("<-- %s (success)\n", __func__); > > +out_free_page: > > + __free_page(scratch); > > + return ret; > > +out_err_free: > > + _ff_layout_free_lseg(fls); > > + ret = ERR_PTR(rc); > > + dprintk("<-- %s (%d)\n", __func__, rc); > > + goto out_free_page; > > +} > > + > > +static bool ff_layout_has_rw_segments(struct pnfs_layout_hdr *layout) > > +{ > > + struct pnfs_layout_segment *lseg; > > + > > + list_for_each_entry(lseg, &layout->plh_segs, pls_list) > > + if (lseg->pls_range.iomode == IOMODE_RW) > > + return true; > > + > > + return false; > > +} > > + > > +static void > > +ff_layout_free_lseg(struct pnfs_layout_segment *lseg) > > +{ > > + struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg); > > + int i; > > + > > + dprintk("--> %s\n", __func__); > > + > > + for (i = 0; i < fls->mirror_array_cnt; i++) { > > + if (fls->mirror_array[i]) { > > + nfs4_ff_layout_put_deviceid(fls->mirror_array[i]->mirror_ds); > > + fls->mirror_array[i]->mirror_ds = NULL; > > + if (fls->mirror_array[i]->cred) { > > + put_rpccred(fls->mirror_array[i]->cred); > > + fls->mirror_array[i]->cred = NULL; > > + } > > + } > > + } > > + > > + if (lseg->pls_range.iomode == IOMODE_RW) { > > + struct nfs4_flexfile_layout *ffl; > > + struct inode *inode; > > + > > + ffl = FF_LAYOUT_FROM_HDR(lseg->pls_layout); > > + inode = ffl->generic_hdr.plh_inode; > > + spin_lock(&inode->i_lock); > > + if (!ff_layout_has_rw_segments(lseg->pls_layout)) { > > + ffl->commit_info.nbuckets = 0; > > + kfree(ffl->commit_info.buckets); > > + ffl->commit_info.buckets = NULL; > > + } > > + spin_unlock(&inode->i_lock); > > + } > > + _ff_layout_free_lseg(fls); > > +} > > + > > +/* Return 1 until we have multiple lsegs support */ > > +static int > > +ff_layout_get_lseg_count(struct nfs4_ff_layout_segment *fls) > > +{ > > + return 1; > > +} > > + > > +static int > > +ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg, > > + struct nfs_commit_info *cinfo, > > + gfp_t gfp_flags) > > +{ > > + struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg); > > + struct pnfs_commit_bucket *buckets; > > + int size; > > + > > + if (cinfo->ds->nbuckets != 0) { > > + /* This assumes there is only one RW lseg per file. > > + * To support multiple lseg per file, we need to > > + * change struct pnfs_commit_bucket to allow dynamic > > + * increasing nbuckets. > > + */ > > + return 0; > > + } > > + > > + size = ff_layout_get_lseg_count(fls) * FF_LAYOUT_MIRROR_COUNT(lseg); > > + > > + buckets = kcalloc(size, sizeof(struct pnfs_commit_bucket), > > + gfp_flags); > > + if (!buckets) > > + return -ENOMEM; > > + else { > > + int i; > > + > > + spin_lock(cinfo->lock); > > + if (cinfo->ds->nbuckets != 0) > > + kfree(buckets); > > + else { > > + cinfo->ds->buckets = buckets; > > + cinfo->ds->nbuckets = size; > > + for (i = 0; i < size; i++) { > > + INIT_LIST_HEAD(&buckets[i].written); > > + INIT_LIST_HEAD(&buckets[i].committing); > > + /* mark direct verifier as unset */ > > + buckets[i].direct_verf.committed = > > + NFS_INVALID_STABLE_HOW; > > + } > > + } > > + spin_unlock(cinfo->lock); > > + return 0; > > + } > > +} > > + > > +static struct nfs4_pnfs_ds * > > +ff_layout_choose_best_ds_for_read(struct nfs_pageio_descriptor *pgio, > > + int *best_idx) > > +{ > > + struct nfs4_ff_layout_segment *fls; > > + struct nfs4_pnfs_ds *ds; > > + int idx; > > + > > + fls = FF_LAYOUT_LSEG(pgio->pg_lseg); > > + /* mirrors are sorted by efficiency */ > > + for (idx = 0; idx < fls->mirror_array_cnt; idx++) { > > + ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, idx, false); > > + if (ds) { > > + *best_idx = idx; > > + return ds; > > + } > > + } > > + > > + return NULL; > > +} > > + > > +static void > > +ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio, > > + struct nfs_page *req) > > +{ > > + struct nfs_pgio_mirror *pgm; > > + struct nfs4_ff_layout_mirror *mirror; > > + struct nfs4_pnfs_ds *ds; > > + int ds_idx; > > + > > + /* Use full layout for now */ > > + if (!pgio->pg_lseg) > > + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, > > + req->wb_context, > > + 0, > > + NFS4_MAX_UINT64, > > + IOMODE_READ, > > + GFP_KERNEL); > > + /* If no lseg, fall back to read through mds */ > > + if (pgio->pg_lseg == NULL) > > + goto out_mds; > > + > > + ds = ff_layout_choose_best_ds_for_read(pgio, &ds_idx); > > + if (!ds) > > + goto out_mds; > > + mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx); > > + > > + pgio->pg_mirror_idx = ds_idx; > > + > > + /* read always uses only one mirror - idx 0 for pgio layer */ > > + pgm = &pgio->pg_mirrors[0]; > > + pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].rsize; > > + > > + return; > > +out_mds: > > + pnfs_put_lseg(pgio->pg_lseg); > > + pgio->pg_lseg = NULL; > > + nfs_pageio_reset_read_mds(pgio); > > +} > > + > > +static void > > +ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio, > > + struct nfs_page *req) > > +{ > > + struct nfs4_ff_layout_mirror *mirror; > > + struct nfs_pgio_mirror *pgm; > > + struct nfs_commit_info cinfo; > > + struct nfs4_pnfs_ds *ds; > > + int i; > > + int status; > > + > > + if (!pgio->pg_lseg) > > + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, > > + req->wb_context, > > + 0, > > + NFS4_MAX_UINT64, > > + IOMODE_RW, > > + GFP_NOFS); > > + /* If no lseg, fall back to write through mds */ > > + if (pgio->pg_lseg == NULL) > > + goto out_mds; > > + > > + nfs_init_cinfo(&cinfo, pgio->pg_inode, pgio->pg_dreq); > > + status = ff_layout_alloc_commit_info(pgio->pg_lseg, &cinfo, GFP_NOFS); > > + if (status < 0) > > + goto out_mds; > > + > > + /* Use a direct mapping of ds_idx to pgio mirror_idx */ > > + if (WARN_ON_ONCE(pgio->pg_mirror_count != > > + FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg))) > > + goto out_mds; > > + > > + for (i = 0; i < pgio->pg_mirror_count; i++) { > > + ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, i, true); > > + if (!ds) > > + goto out_mds; > > + pgm = &pgio->pg_mirrors[i]; > > + mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i); > > + pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].wsize; > > + } > > + > > + return; > > + > > +out_mds: > > + pnfs_put_lseg(pgio->pg_lseg); > > + pgio->pg_lseg = NULL; > > + nfs_pageio_reset_write_mds(pgio); > > +} > > + > > +static unsigned int > > +ff_layout_pg_get_mirror_count_write(struct nfs_pageio_descriptor *pgio, > > + struct nfs_page *req) > > +{ > > + if (!pgio->pg_lseg) > > + pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode, > > + req->wb_context, > > + 0, > > + NFS4_MAX_UINT64, > > + IOMODE_RW, > > + GFP_NOFS); > > + if (pgio->pg_lseg) > > + return FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg); > > + > > + /* no lseg means that pnfs is not in use, so no mirroring here */ > > + pnfs_put_lseg(pgio->pg_lseg); > > + pgio->pg_lseg = NULL; > > + nfs_pageio_reset_write_mds(pgio); > > + return 1; > > +} > > + > > +static const struct nfs_pageio_ops ff_layout_pg_read_ops = { > > + .pg_init = ff_layout_pg_init_read, > > + .pg_test = pnfs_generic_pg_test, > > + .pg_doio = pnfs_generic_pg_readpages, > > + .pg_cleanup = pnfs_generic_pg_cleanup, > > +}; > > + > > +static const struct nfs_pageio_ops ff_layout_pg_write_ops = { > > + .pg_init = ff_layout_pg_init_write, > > + .pg_test = pnfs_generic_pg_test, > > + .pg_doio = pnfs_generic_pg_writepages, > > + .pg_get_mirror_count = ff_layout_pg_get_mirror_count_write, > > + .pg_cleanup = pnfs_generic_pg_cleanup, > > +}; > > + > > +static void ff_layout_reset_write(struct nfs_pgio_header *hdr, bool retry_pnfs) > > +{ > > + struct rpc_task *task = &hdr->task; > > + > > + pnfs_layoutcommit_inode(hdr->inode, false); > > + > > + if (retry_pnfs) { > > + dprintk("%s Reset task %5u for i/o through pNFS " > > + "(req %s/%llu, %u bytes @ offset %llu)\n", __func__, > > + hdr->task.tk_pid, > > + hdr->inode->i_sb->s_id, > > + (unsigned long long)NFS_FILEID(hdr->inode), > > + hdr->args.count, > > + (unsigned long long)hdr->args.offset); > > + > > + if (!hdr->dreq) { > > + struct nfs_open_context *ctx; > > + > > + ctx = nfs_list_entry(hdr->pages.next)->wb_context; > > + set_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags); > > + hdr->completion_ops->error_cleanup(&hdr->pages); > > + } else { > > + nfs_direct_set_resched_writes(hdr->dreq); > > + } > > + return; > > + } > > + > > + if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { > > + dprintk("%s Reset task %5u for i/o through MDS " > > + "(req %s/%llu, %u bytes @ offset %llu)\n", __func__, > > + hdr->task.tk_pid, > > + hdr->inode->i_sb->s_id, > > + (unsigned long long)NFS_FILEID(hdr->inode), > > + hdr->args.count, > > + (unsigned long long)hdr->args.offset); > > + > > + task->tk_status = pnfs_write_done_resend_to_mds(hdr); > > + } > > +} > > + > > +static void ff_layout_reset_read(struct nfs_pgio_header *hdr) > > +{ > > + struct rpc_task *task = &hdr->task; > > + > > + pnfs_layoutcommit_inode(hdr->inode, false); > > + > > + if (!test_and_set_bit(NFS_IOHDR_REDO, &hdr->flags)) { > > + dprintk("%s Reset task %5u for i/o through MDS " > > + "(req %s/%llu, %u bytes @ offset %llu)\n", __func__, > > + hdr->task.tk_pid, > > + hdr->inode->i_sb->s_id, > > + (unsigned long long)NFS_FILEID(hdr->inode), > > + hdr->args.count, > > + (unsigned long long)hdr->args.offset); > > + > > + task->tk_status = pnfs_read_done_resend_to_mds(hdr); > > + } > > +} > > + > > +static int ff_layout_async_handle_error_v4(struct rpc_task *task, > > + struct nfs4_state *state, > > + struct nfs_client *clp, > > + struct pnfs_layout_segment *lseg, > > + int idx) > > +{ > > + struct pnfs_layout_hdr *lo = lseg->pls_layout; > > + struct inode *inode = lo->plh_inode; > > + struct nfs_server *mds_server = NFS_SERVER(inode); > > + > > + struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); > > + struct nfs_client *mds_client = mds_server->nfs_client; > > + struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table; > > + > > + if (task->tk_status >= 0) > > + return 0; > > + > > + switch (task->tk_status) { > > + /* MDS state errors */ > > + case -NFS4ERR_DELEG_REVOKED: > > + case -NFS4ERR_ADMIN_REVOKED: > > + case -NFS4ERR_BAD_STATEID: > > + if (state == NULL) > > + break; > > + nfs_remove_bad_delegation(state->inode); > > + case -NFS4ERR_OPENMODE: > > + if (state == NULL) > > + break; > > + if (nfs4_schedule_stateid_recovery(mds_server, state) < 0) > > + goto out_bad_stateid; > > + goto wait_on_recovery; > > + case -NFS4ERR_EXPIRED: > > + if (state != NULL) { > > + if (nfs4_schedule_stateid_recovery(mds_server, state) < 0) > > + goto out_bad_stateid; > > + } > > + nfs4_schedule_lease_recovery(mds_client); > > + goto wait_on_recovery; > > + /* DS session errors */ > > + case -NFS4ERR_BADSESSION: > > + case -NFS4ERR_BADSLOT: > > + case -NFS4ERR_BAD_HIGH_SLOT: > > + case -NFS4ERR_DEADSESSION: > > + case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION: > > + case -NFS4ERR_SEQ_FALSE_RETRY: > > + case -NFS4ERR_SEQ_MISORDERED: > > + dprintk("%s ERROR %d, Reset session. Exchangeid " > > + "flags 0x%x\n", __func__, task->tk_status, > > + clp->cl_exchange_flags); > > + nfs4_schedule_session_recovery(clp->cl_session, task->tk_status); > > + break; > > + case -NFS4ERR_DELAY: > > + case -NFS4ERR_GRACE: > > + rpc_delay(task, FF_LAYOUT_POLL_RETRY_MAX); > > + break; > > + case -NFS4ERR_RETRY_UNCACHED_REP: > > + break; > > + /* Invalidate Layout errors */ > > + case -NFS4ERR_PNFS_NO_LAYOUT: > > + case -ESTALE: /* mapped NFS4ERR_STALE */ > > + case -EBADHANDLE: /* mapped NFS4ERR_BADHANDLE */ > > + case -EISDIR: /* mapped NFS4ERR_ISDIR */ > > + case -NFS4ERR_FHEXPIRED: > > + case -NFS4ERR_WRONG_TYPE: > > + dprintk("%s Invalid layout error %d\n", __func__, > > + task->tk_status); > > + /* > > + * Destroy layout so new i/o will get a new layout. > > + * Layout will not be destroyed until all current lseg > > + * references are put. Mark layout as invalid to resend failed > > + * i/o and all i/o waiting on the slot table to the MDS until > > + * layout is destroyed and a new valid layout is obtained. > > + */ > > + pnfs_destroy_layout(NFS_I(inode)); > > + rpc_wake_up(&tbl->slot_tbl_waitq); > > + goto reset; > > + /* RPC connection errors */ > > + case -ECONNREFUSED: > > + case -EHOSTDOWN: > > + case -EHOSTUNREACH: > > + case -ENETUNREACH: > > + case -EIO: > > + case -ETIMEDOUT: > > + case -EPIPE: > > + dprintk("%s DS connection error %d\n", __func__, > > + task->tk_status); > > + nfs4_mark_deviceid_unavailable(devid); > > + rpc_wake_up(&tbl->slot_tbl_waitq); > > + /* fall through */ > > + default: > > + if (ff_layout_has_available_ds(lseg)) > > + return -NFS4ERR_RESET_TO_PNFS; > > +reset: > > + dprintk("%s Retry through MDS. Error %d\n", __func__, > > + task->tk_status); > > + return -NFS4ERR_RESET_TO_MDS; > > + } > > +out: > > + task->tk_status = 0; > > + return -EAGAIN; > > +out_bad_stateid: > > + task->tk_status = -EIO; > > + return 0; > > +wait_on_recovery: > > + rpc_sleep_on(&mds_client->cl_rpcwaitq, task, NULL); > > + if (test_bit(NFS4CLNT_MANAGER_RUNNING, &mds_client->cl_state) == 0) > > + rpc_wake_up_queued_task(&mds_client->cl_rpcwaitq, task); > > + goto out; > > +} > > + > > +/* Retry all errors through either pNFS or MDS except for -EJUKEBOX */ > > +static int ff_layout_async_handle_error_v3(struct rpc_task *task, > > + struct pnfs_layout_segment *lseg, > > + int idx) > > +{ > > + struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); > > + > > + if (task->tk_status >= 0) > > + return 0; > > + > > + if (task->tk_status != -EJUKEBOX) { > > + dprintk("%s DS connection error %d\n", __func__, > > + task->tk_status); > > + nfs4_mark_deviceid_unavailable(devid); > > + if (ff_layout_has_available_ds(lseg)) > > + return -NFS4ERR_RESET_TO_PNFS; > > + else > > + return -NFS4ERR_RESET_TO_MDS; > > + } > > + > > + if (task->tk_status == -EJUKEBOX) > > + nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY); > > + task->tk_status = 0; > > + rpc_restart_call(task); > > + rpc_delay(task, NFS_JUKEBOX_RETRY_TIME); > > + return -EAGAIN; > > +} > > + > > +static int ff_layout_async_handle_error(struct rpc_task *task, > > + struct nfs4_state *state, > > + struct nfs_client *clp, > > + struct pnfs_layout_segment *lseg, > > + int idx) > > +{ > > + int vers = clp->cl_nfs_mod->rpc_vers->number; > > + > > + switch (vers) { > > + case 3: > > + return ff_layout_async_handle_error_v3(task, lseg, idx); > > + case 4: > > + return ff_layout_async_handle_error_v4(task, state, clp, > > + lseg, idx); > > + default: > > + /* should never happen */ > > + WARN_ON_ONCE(1); > > + return 0; > > + } > > +} > > + > > +static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, > > + int idx, u64 offset, u64 length, > > + u32 status, int opnum) > > +{ > > + struct nfs4_ff_layout_mirror *mirror; > > + int err; > > + > > + mirror = FF_LAYOUT_COMP(lseg, idx); > > + err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout), > > + mirror, offset, length, status, opnum, > > + GFP_NOIO); > > + dprintk("%s: err %d op %d status %u\n", __func__, err, opnum, status); > > +} > > + > > +/* NFS_PROTO call done callback routines */ > > + > > +static int ff_layout_read_done_cb(struct rpc_task *task, > > + struct nfs_pgio_header *hdr) > > +{ > > + struct inode *inode; > > + int err; > > + > > + trace_nfs4_pnfs_read(hdr, task->tk_status); > > + if (task->tk_status == -ETIMEDOUT && !hdr->res.op_status) > > + hdr->res.op_status = NFS4ERR_NXIO; > > + if (task->tk_status < 0 && hdr->res.op_status) > > + ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx, > > + hdr->args.offset, hdr->args.count, > > + hdr->res.op_status, OP_READ); > > + err = ff_layout_async_handle_error(task, hdr->args.context->state, > > + hdr->ds_clp, hdr->lseg, > > + hdr->pgio_mirror_idx); > > + > > + switch (err) { > > + case -NFS4ERR_RESET_TO_PNFS: > > + set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, > > + &hdr->lseg->pls_layout->plh_flags); > > + pnfs_read_resend_pnfs(hdr); > > + return task->tk_status; > > + case -NFS4ERR_RESET_TO_MDS: > > + inode = hdr->lseg->pls_layout->plh_inode; > > + pnfs_error_mark_layout_for_return(inode, hdr->lseg); > > + ff_layout_reset_read(hdr); > > + return task->tk_status; > > + case -EAGAIN: > > + rpc_restart_call_prepare(task); > > + return -EAGAIN; > > + } > > + > > + return 0; > > +} > > + > > +/* > > + * We reference the rpc_cred of the first WRITE that triggers the need for > > + * a LAYOUTCOMMIT, and use it to send the layoutcommit compound. > > + * rfc5661 is not clear about which credential should be used. > > + * > > + * Flexlayout client should treat DS replied FILE_SYNC as DATA_SYNC, so > > + * to follow http://www.rfc-editor.org/errata_search.php?rfc=5661&eid=2751 > > + * we always send layoutcommit after DS writes. > > + */ > > +static void > > +ff_layout_set_layoutcommit(struct nfs_pgio_header *hdr) > > +{ > > + pnfs_set_layoutcommit(hdr); > > + dprintk("%s inode %lu pls_end_pos %lu\n", __func__, hdr->inode->i_ino, > > + (unsigned long) NFS_I(hdr->inode)->layout->plh_lwb); > > +} > > + > > +static bool > > +ff_layout_reset_to_mds(struct pnfs_layout_segment *lseg, int idx) > > +{ > > + /* No mirroring for now */ > > + struct nfs4_deviceid_node *node = FF_LAYOUT_DEVID_NODE(lseg, idx); > > + > > + return ff_layout_test_devid_unavailable(node); > > +} > > + > > +static int ff_layout_read_prepare_common(struct rpc_task *task, > > + struct nfs_pgio_header *hdr) > > +{ > > + if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) { > > + rpc_exit(task, -EIO); > > + return -EIO; > > + } > > + if (ff_layout_reset_to_mds(hdr->lseg, hdr->pgio_mirror_idx)) { > > + dprintk("%s task %u reset io to MDS\n", __func__, task->tk_pid); > > + if (ff_layout_has_available_ds(hdr->lseg)) > > + pnfs_read_resend_pnfs(hdr); > > + else > > + ff_layout_reset_read(hdr); > > + rpc_exit(task, 0); > > + return -EAGAIN; > > + } > > + hdr->pgio_done_cb = ff_layout_read_done_cb; > > + > > + return 0; > > +} > > + > > +/* > > + * Call ops for the async read/write cases > > + * In the case of dense layouts, the offset needs to be reset to its > > + * original value. > > + */ > > +static void ff_layout_read_prepare_v3(struct rpc_task *task, void *data) > > +{ > > + struct nfs_pgio_header *hdr = data; > > + > > + if (ff_layout_read_prepare_common(task, hdr)) > > + return; > > + > > + rpc_call_start(task); > > +} > > + > > +static int ff_layout_setup_sequence(struct nfs_client *ds_clp, > > + struct nfs4_sequence_args *args, > > + struct nfs4_sequence_res *res, > > + struct rpc_task *task) > > +{ > > + if (ds_clp->cl_session) > > + return nfs41_setup_sequence(ds_clp->cl_session, > > + args, > > + res, > > + task); > > + return nfs40_setup_sequence(ds_clp->cl_slot_tbl, > > + args, > > + res, > > + task); > > I'm not quite seeing how we would end up calling the NFS v4.0 function here. If there is a session, then we call the 4.1 function, else we call the 4.0 one. > > > > +} > > + > > +static void ff_layout_read_prepare_v4(struct rpc_task *task, void *data) > > +{ > > + struct nfs_pgio_header *hdr = data; > > + > > + if (ff_layout_read_prepare_common(task, hdr)) > > + return; > > + > > + if (ff_layout_setup_sequence(hdr->ds_clp, > > + &hdr->args.seq_args, > > + &hdr->res.seq_res, > > + task)) > > + return; > > + > > + if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context, > > + hdr->args.lock_context, FMODE_READ) == -EIO) > > + rpc_exit(task, -EIO); /* lost lock, terminate I/O */ > > +} > > + > > +static void ff_layout_read_call_done(struct rpc_task *task, void *data) > > +{ > > + struct nfs_pgio_header *hdr = data; > > + > > + dprintk("--> %s task->tk_status %d\n", __func__, task->tk_status); > > + > > + if (test_bit(NFS_IOHDR_REDO, &hdr->flags) && > > + task->tk_status == 0) { > > + nfs4_sequence_done(task, &hdr->res.seq_res); > > + return; > > + } > > + > > + /* Note this may cause RPC to be resent */ > > + hdr->mds_ops->rpc_call_done(task, hdr); > > +} > > + > > +static void ff_layout_read_count_stats(struct rpc_task *task, void *data) > > +{ > > + struct nfs_pgio_header *hdr = data; > > + > > + rpc_count_iostats_metrics(task, > > + &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_READ]); > > +} > > + > > +static int ff_layout_write_done_cb(struct rpc_task *task, > > + struct nfs_pgio_header *hdr) > > +{ > > + struct inode *inode; > > + int err; > > + > > + trace_nfs4_pnfs_write(hdr, task->tk_status); > > + if (task->tk_status == -ETIMEDOUT && !hdr->res.op_status) > > + hdr->res.op_status = NFS4ERR_NXIO; > > + if (task->tk_status < 0 && hdr->res.op_status) > > + ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx, > > + hdr->args.offset, hdr->args.count, > > + hdr->res.op_status, OP_WRITE); > > + err = ff_layout_async_handle_error(task, hdr->args.context->state, > > + hdr->ds_clp, hdr->lseg, > > + hdr->pgio_mirror_idx); > > + > > + switch (err) { > > + case -NFS4ERR_RESET_TO_PNFS: > > + case -NFS4ERR_RESET_TO_MDS: > > + inode = hdr->lseg->pls_layout->plh_inode; > > + pnfs_error_mark_layout_for_return(inode, hdr->lseg); > > + if (err == -NFS4ERR_RESET_TO_PNFS) { > > + pnfs_set_retry_layoutget(hdr->lseg->pls_layout); > > + ff_layout_reset_write(hdr, true); > > + } else { > > + pnfs_clear_retry_layoutget(hdr->lseg->pls_layout); > > + ff_layout_reset_write(hdr, false); > > + } > > + return task->tk_status; > > + case -EAGAIN: > > + rpc_restart_call_prepare(task); > > + return -EAGAIN; > > + } > > + > > + if (hdr->res.verf->committed == NFS_FILE_SYNC || > > + hdr->res.verf->committed == NFS_DATA_SYNC) > > + ff_layout_set_layoutcommit(hdr); > > + > > + return 0; > > +} > > + > > +static int ff_layout_commit_done_cb(struct rpc_task *task, > > + struct nfs_commit_data *data) > > +{ > > + struct inode *inode; > > + int err; > > + > > + trace_nfs4_pnfs_commit_ds(data, task->tk_status); > > + if (task->tk_status == -ETIMEDOUT && !data->res.op_status) > > + data->res.op_status = NFS4ERR_NXIO; > > + if (task->tk_status < 0 && data->res.op_status) > > + ff_layout_io_track_ds_error(data->lseg, data->ds_commit_index, > > + data->args.offset, data->args.count, > > + data->res.op_status, OP_COMMIT); > > + err = ff_layout_async_handle_error(task, NULL, data->ds_clp, > > + data->lseg, data->ds_commit_index); > > + > > + switch (err) { > > + case -NFS4ERR_RESET_TO_PNFS: > > + case -NFS4ERR_RESET_TO_MDS: > > + inode = data->lseg->pls_layout->plh_inode; > > + pnfs_error_mark_layout_for_return(inode, data->lseg); > > + if (err == -NFS4ERR_RESET_TO_PNFS) > > + pnfs_set_retry_layoutget(data->lseg->pls_layout); > > + else > > + pnfs_clear_retry_layoutget(data->lseg->pls_layout); > > + pnfs_generic_prepare_to_resend_writes(data); > > + return -EAGAIN; > > + case -EAGAIN: > > + rpc_restart_call_prepare(task); > > + return -EAGAIN; > > + } > > + > > + if (data->verf.committed == NFS_UNSTABLE) > > + pnfs_commit_set_layoutcommit(data); > > + > > + return 0; > > +} > > + > > +static int ff_layout_write_prepare_common(struct rpc_task *task, > > + struct nfs_pgio_header *hdr) > > +{ > > + if (unlikely(test_bit(NFS_CONTEXT_BAD, &hdr->args.context->flags))) { > > + rpc_exit(task, -EIO); > > + return -EIO; > > + } > > + > > + if (ff_layout_reset_to_mds(hdr->lseg, hdr->pgio_mirror_idx)) { > > + bool retry_pnfs; > > + > > + retry_pnfs = ff_layout_has_available_ds(hdr->lseg); > > + dprintk("%s task %u reset io to %s\n", __func__, > > + task->tk_pid, retry_pnfs ? "pNFS" : "MDS"); > > + ff_layout_reset_write(hdr, retry_pnfs); > > + rpc_exit(task, 0); > > + return -EAGAIN; > > + } > > + > > + return 0; > > +} > > + > > +static void ff_layout_write_prepare_v3(struct rpc_task *task, void *data) > > +{ > > + struct nfs_pgio_header *hdr = data; > > + > > + if (ff_layout_write_prepare_common(task, hdr)) > > + return; > > + > > + rpc_call_start(task); > > +} > > + > > +static void ff_layout_write_prepare_v4(struct rpc_task *task, void *data) > > +{ > > + struct nfs_pgio_header *hdr = data; > > + > > + if (ff_layout_write_prepare_common(task, hdr)) > > + return; > > + > > + if (ff_layout_setup_sequence(hdr->ds_clp, > > + &hdr->args.seq_args, > > + &hdr->res.seq_res, > > + task)) > > + return; > > + > > + if (nfs4_set_rw_stateid(&hdr->args.stateid, hdr->args.context, > > + hdr->args.lock_context, FMODE_WRITE) == -EIO) > > + rpc_exit(task, -EIO); /* lost lock, terminate I/O */ > > +} > > + > > +static void ff_layout_write_call_done(struct rpc_task *task, void *data) > > +{ > > + struct nfs_pgio_header *hdr = data; > > + > > + if (test_bit(NFS_IOHDR_REDO, &hdr->flags) && > > + task->tk_status == 0) { > > + nfs4_sequence_done(task, &hdr->res.seq_res); > > + return; > > + } > > + > > + /* Note this may cause RPC to be resent */ > > + hdr->mds_ops->rpc_call_done(task, hdr); > > +} > > + > > +static void ff_layout_write_count_stats(struct rpc_task *task, void *data) > > +{ > > + struct nfs_pgio_header *hdr = data; > > + > > + rpc_count_iostats_metrics(task, > > + &NFS_CLIENT(hdr->inode)->cl_metrics[NFSPROC4_CLNT_WRITE]); > > +} > > + > > +static void ff_layout_commit_prepare_v3(struct rpc_task *task, void *data) > > +{ > > + rpc_call_start(task); > > +} > > + > > +static void ff_layout_commit_prepare_v4(struct rpc_task *task, void *data) > > +{ > > + struct nfs_commit_data *wdata = data; > > + > > + ff_layout_setup_sequence(wdata->ds_clp, > > + &wdata->args.seq_args, > > + &wdata->res.seq_res, > > + task); > > +} > > + > > +static void ff_layout_commit_count_stats(struct rpc_task *task, void *data) > > +{ > > + struct nfs_commit_data *cdata = data; > > + > > + rpc_count_iostats_metrics(task, > > + &NFS_CLIENT(cdata->inode)->cl_metrics[NFSPROC4_CLNT_COMMIT]); > > +} > > + > > +static const struct rpc_call_ops ff_layout_read_call_ops_v3 = { > > + .rpc_call_prepare = ff_layout_read_prepare_v3, > > + .rpc_call_done = ff_layout_read_call_done, > > + .rpc_count_stats = ff_layout_read_count_stats, > > + .rpc_release = pnfs_generic_rw_release, > > +}; > > + > > +static const struct rpc_call_ops ff_layout_read_call_ops_v4 = { > > + .rpc_call_prepare = ff_layout_read_prepare_v4, > > + .rpc_call_done = ff_layout_read_call_done, > > + .rpc_count_stats = ff_layout_read_count_stats, > > + .rpc_release = pnfs_generic_rw_release, > > +}; > > + > > +static const struct rpc_call_ops ff_layout_write_call_ops_v3 = { > > + .rpc_call_prepare = ff_layout_write_prepare_v3, > > + .rpc_call_done = ff_layout_write_call_done, > > + .rpc_count_stats = ff_layout_write_count_stats, > > + .rpc_release = pnfs_generic_rw_release, > > +}; > > + > > +static const struct rpc_call_ops ff_layout_write_call_ops_v4 = { > > + .rpc_call_prepare = ff_layout_write_prepare_v4, > > + .rpc_call_done = ff_layout_write_call_done, > > + .rpc_count_stats = ff_layout_write_count_stats, > > + .rpc_release = pnfs_generic_rw_release, > > +}; > > + > > +static const struct rpc_call_ops ff_layout_commit_call_ops_v3 = { > > + .rpc_call_prepare = ff_layout_commit_prepare_v3, > > + .rpc_call_done = pnfs_generic_write_commit_done, > > + .rpc_count_stats = ff_layout_commit_count_stats, > > + .rpc_release = pnfs_generic_commit_release, > > +}; > > + > > +static const struct rpc_call_ops ff_layout_commit_call_ops_v4 = { > > + .rpc_call_prepare = ff_layout_commit_prepare_v4, > > + .rpc_call_done = pnfs_generic_write_commit_done, > > + .rpc_count_stats = ff_layout_commit_count_stats, > > + .rpc_release = pnfs_generic_commit_release, > > +}; > > + > > +static enum pnfs_try_status > > +ff_layout_read_pagelist(struct nfs_pgio_header *hdr) > > +{ > > + struct pnfs_layout_segment *lseg = hdr->lseg; > > + struct nfs4_pnfs_ds *ds; > > + struct rpc_clnt *ds_clnt; > > + struct rpc_cred *ds_cred; > > + loff_t offset = hdr->args.offset; > > + u32 idx = hdr->pgio_mirror_idx; > > + int vers; > > + struct nfs_fh *fh; > > + > > + dprintk("--> %s ino %lu pgbase %u req %Zu@%llu\n", > > + __func__, hdr->inode->i_ino, > > + hdr->args.pgbase, (size_t)hdr->args.count, offset); > > + > > + ds = nfs4_ff_layout_prepare_ds(lseg, idx, false); > > + if (!ds) > > + goto out_failed; > > + > > + ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp, > > + hdr->inode); > > + if (IS_ERR(ds_clnt)) > > + goto out_failed; > > + > > + ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred); > > + if (IS_ERR(ds_cred)) > > + goto out_failed; > > + > > + vers = nfs4_ff_layout_ds_version(lseg, idx); > > + > > + dprintk("%s USE DS: %s cl_count %d vers %d\n", __func__, > > + ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count), vers); > > + > > + atomic_inc(&ds->ds_clp->cl_count); > > + hdr->ds_clp = ds->ds_clp; > > + fh = nfs4_ff_layout_select_ds_fh(lseg, idx); > > + if (fh) > > + hdr->args.fh = fh; > > + > > + /* > > + * Note that if we ever decide to split across DSes, > > + * then we may need to handle dense-like offsets. > > + */ > > + hdr->args.offset = offset; > > + hdr->mds_offset = offset; > > + > > + /* Perform an asynchronous read to ds */ > > + nfs_initiate_pgio(ds_clnt, hdr, ds_cred, ds->ds_clp->rpc_ops, > > + vers == 3 ? &ff_layout_read_call_ops_v3 : > > + &ff_layout_read_call_ops_v4, > > + 0, RPC_TASK_SOFTCONN); > > + > > + return PNFS_ATTEMPTED; > > + > > +out_failed: > > + if (ff_layout_has_available_ds(lseg)) > > + return PNFS_TRY_AGAIN; > > + return PNFS_NOT_ATTEMPTED; > > +} > > + > > +/* Perform async writes. */ > > +static enum pnfs_try_status > > +ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) > > +{ > > + struct pnfs_layout_segment *lseg = hdr->lseg; > > + struct nfs4_pnfs_ds *ds; > > + struct rpc_clnt *ds_clnt; > > + struct rpc_cred *ds_cred; > > + loff_t offset = hdr->args.offset; > > + int vers; > > + struct nfs_fh *fh; > > + int idx = hdr->pgio_mirror_idx; > > + > > + ds = nfs4_ff_layout_prepare_ds(lseg, idx, true); > > + if (!ds) > > + return PNFS_NOT_ATTEMPTED; > > + > > + ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp, > > + hdr->inode); > > + if (IS_ERR(ds_clnt)) > > + return PNFS_NOT_ATTEMPTED; > > + > > + ds_cred = ff_layout_get_ds_cred(lseg, idx, hdr->cred); > > + if (IS_ERR(ds_cred)) > > + return PNFS_NOT_ATTEMPTED; > > + > > + vers = nfs4_ff_layout_ds_version(lseg, idx); > > + > > + dprintk("%s ino %lu sync %d req %Zu@%llu DS: %s cl_count %d vers %d\n", > > + __func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count, > > + offset, ds->ds_remotestr, atomic_read(&ds->ds_clp->cl_count), > > + vers); > > + > > + hdr->pgio_done_cb = ff_layout_write_done_cb; > > + atomic_inc(&ds->ds_clp->cl_count); > > + hdr->ds_clp = ds->ds_clp; > > + hdr->ds_commit_idx = idx; > > + fh = nfs4_ff_layout_select_ds_fh(lseg, idx); > > + if (fh) > > + hdr->args.fh = fh; > > + > > + /* > > + * Note that if we ever decide to split across DSes, > > + * then we may need to handle dense-like offsets. > > + */ > > + hdr->args.offset = offset; > > + > > + /* Perform an asynchronous write */ > > + nfs_initiate_pgio(ds_clnt, hdr, ds_cred, ds->ds_clp->rpc_ops, > > + vers == 3 ? &ff_layout_write_call_ops_v3 : > > + &ff_layout_write_call_ops_v4, > > + sync, RPC_TASK_SOFTCONN); > > + return PNFS_ATTEMPTED; > > +} > > + > > +static void > > +ff_layout_mark_request_commit(struct nfs_page *req, > > + struct pnfs_layout_segment *lseg, > > + struct nfs_commit_info *cinfo, > > + u32 ds_commit_idx) > > +{ > > + struct list_head *list; > > + struct pnfs_commit_bucket *buckets; > > + > > + spin_lock(cinfo->lock); > > + buckets = cinfo->ds->buckets; > > + list = &buckets[ds_commit_idx].written; > > + if (list_empty(list)) { > > + /* Non-empty buckets hold a reference on the lseg. That ref > > + * is normally transferred to the COMMIT call and released > > + * there. It could also be released if the last req is pulled > > + * off due to a rewrite, in which case it will be done in > > + * pnfs_common_clear_request_commit > > + */ > > + WARN_ON_ONCE(buckets[ds_commit_idx].wlseg != NULL); > > + buckets[ds_commit_idx].wlseg = pnfs_get_lseg(lseg); > > + } > > + set_bit(PG_COMMIT_TO_DS, &req->wb_flags); > > + cinfo->ds->nwritten++; > > + > > + /* nfs_request_add_commit_list(). We need to add req to list without > > + * dropping cinfo lock. > > + */ > > + set_bit(PG_CLEAN, &(req)->wb_flags); > > + nfs_list_add_request(req, list); > > + cinfo->mds->ncommit++; > > + spin_unlock(cinfo->lock); > > + if (!cinfo->dreq) { > > + inc_zone_page_state(req->wb_page, NR_UNSTABLE_NFS); > > + inc_bdi_stat(page_file_mapping(req->wb_page)->backing_dev_info, > > + BDI_RECLAIMABLE); > > + __mark_inode_dirty(req->wb_context->dentry->d_inode, > > + I_DIRTY_DATASYNC); > > + } > > +} > > + > > +static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i) > > +{ > > + return i; > > +} > > Is calc_ds_index_from_commit() this something that will be expanded on later? Ah, it took me a bit, but this is a copy of the file layout calc_ds_index_from_commit(). And we only support SPARSE striping. We could below simply state: idx = data->ds_commit_index; But I wanted to keep the same flow as the file layout. More than willing to change this now that we know we will not support DENSE layouts. > > > + > > +static struct nfs_fh * > > +select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i) > > +{ > > + struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg); > > + > > + /* FIXME: Assume that there is only one NFS version available > > + * for the DS. > > + */ > > + return &flseg->mirror_array[i]->fh_versions[0]; > > +} > > + > > +static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) > > +{ > > + struct pnfs_layout_segment *lseg = data->lseg; > > + struct nfs4_pnfs_ds *ds; > > + struct rpc_clnt *ds_clnt; > > + struct rpc_cred *ds_cred; > > + u32 idx; > > + int vers; > > + struct nfs_fh *fh; > > + > > + idx = calc_ds_index_from_commit(lseg, data->ds_commit_index); > > + ds = nfs4_ff_layout_prepare_ds(lseg, idx, true); > > + if (!ds) > > + goto out_err; > > + > > + ds_clnt = nfs4_ff_find_or_create_ds_client(lseg, idx, ds->ds_clp, > > + data->inode); > > + if (IS_ERR(ds_clnt)) > > + goto out_err; > > + > > + ds_cred = ff_layout_get_ds_cred(lseg, idx, data->cred); > > + if (IS_ERR(ds_cred)) > > + goto out_err; > > + > > + vers = nfs4_ff_layout_ds_version(lseg, idx); > > + > > + dprintk("%s ino %lu, how %d cl_count %d vers %d\n", __func__, > > + data->inode->i_ino, how, atomic_read(&ds->ds_clp->cl_count), > > + vers); > > + data->commit_done_cb = ff_layout_commit_done_cb; > > + data->cred = ds_cred; > > + atomic_inc(&ds->ds_clp->cl_count); > > + data->ds_clp = ds->ds_clp; > > + fh = select_ds_fh_from_commit(lseg, data->ds_commit_index); > > + if (fh) > > + data->args.fh = fh; > > + return nfs_initiate_commit(ds_clnt, data, ds->ds_clp->rpc_ops, > > + vers == 3 ? &ff_layout_commit_call_ops_v3 : > > + &ff_layout_commit_call_ops_v4, > > + how, RPC_TASK_SOFTCONN); > > +out_err: > > + pnfs_generic_prepare_to_resend_writes(data); > > + pnfs_generic_commit_release(data); > > + return -EAGAIN; > > +} > > + > > +static int > > +ff_layout_commit_pagelist(struct inode *inode, struct list_head *mds_pages, > > + int how, struct nfs_commit_info *cinfo) > > +{ > > + return pnfs_generic_commit_pagelist(inode, mds_pages, how, cinfo, > > + ff_layout_initiate_commit); > > +} > > + > > +static struct pnfs_ds_commit_info * > > +ff_layout_get_ds_info(struct inode *inode) > > +{ > > + struct pnfs_layout_hdr *layout = NFS_I(inode)->layout; > > + > > + if (layout == NULL) > > + return NULL; > > + else > ^^^^ > Nit: We don't need the else here. Agreed. > > Thanks, > Anna > > > + return &FF_LAYOUT_FROM_HDR(layout)->commit_info; > > +} > > + > > +static void > > +ff_layout_free_deveiceid_node(struct nfs4_deviceid_node *d) > > +{ > > + nfs4_ff_layout_free_deviceid(container_of(d, struct nfs4_ff_layout_ds, > > + id_node)); > > +} > > + > > +static int ff_layout_encode_ioerr(struct nfs4_flexfile_layout *flo, > > + struct xdr_stream *xdr, > > + const struct nfs4_layoutreturn_args *args) > > +{ > > + struct pnfs_layout_hdr *hdr = &flo->generic_hdr; > > + __be32 *start; > > + int count = 0, ret = 0; > > + > > + start = xdr_reserve_space(xdr, 4); > > + if (unlikely(!start)) > > + return -E2BIG; > > + > > + /* This assume we always return _ALL_ layouts */ > > + spin_lock(&hdr->plh_inode->i_lock); > > + ret = ff_layout_encode_ds_ioerr(flo, xdr, &count, &args->range); > > + spin_unlock(&hdr->plh_inode->i_lock); > > + > > + *start = cpu_to_be32(count); > > + > > + return ret; > > +} > > + > > +/* report nothing for now */ > > +static void ff_layout_encode_iostats(struct nfs4_flexfile_layout *flo, > > + struct xdr_stream *xdr, > > + const struct nfs4_layoutreturn_args *args) > > +{ > > + __be32 *p; > > + > > + p = xdr_reserve_space(xdr, 4); > > + if (likely(p)) > > + *p = cpu_to_be32(0); > > +} > > + > > +static struct nfs4_deviceid_node * > > +ff_layout_alloc_deviceid_node(struct nfs_server *server, > > + struct pnfs_device *pdev, gfp_t gfp_flags) > > +{ > > + struct nfs4_ff_layout_ds *dsaddr; > > + > > + dsaddr = nfs4_ff_alloc_deviceid_node(server, pdev, gfp_flags); > > + if (!dsaddr) > > + return NULL; > > + return &dsaddr->id_node; > > +} > > + > > +static void > > +ff_layout_encode_layoutreturn(struct pnfs_layout_hdr *lo, > > + struct xdr_stream *xdr, > > + const struct nfs4_layoutreturn_args *args) > > +{ > > + struct nfs4_flexfile_layout *flo = FF_LAYOUT_FROM_HDR(lo); > > + __be32 *start; > > + > > + dprintk("%s: Begin\n", __func__); > > + start = xdr_reserve_space(xdr, 4); > > + BUG_ON(!start); > > + > > + if (ff_layout_encode_ioerr(flo, xdr, args)) > > + goto out; > > + > > + ff_layout_encode_iostats(flo, xdr, args); > > +out: > > + *start = cpu_to_be32((xdr->p - start - 1) * 4); > > + dprintk("%s: Return\n", __func__); > > +} > > + > > +static struct pnfs_layoutdriver_type flexfilelayout_type = { > > + .id = LAYOUT_FLEX_FILES, > > + .name = "LAYOUT_FLEX_FILES", > > + .owner = THIS_MODULE, > > + .alloc_layout_hdr = ff_layout_alloc_layout_hdr, > > + .free_layout_hdr = ff_layout_free_layout_hdr, > > + .alloc_lseg = ff_layout_alloc_lseg, > > + .free_lseg = ff_layout_free_lseg, > > + .pg_read_ops = &ff_layout_pg_read_ops, > > + .pg_write_ops = &ff_layout_pg_write_ops, > > + .get_ds_info = ff_layout_get_ds_info, > > + .free_deviceid_node = ff_layout_free_deveiceid_node, > > + .mark_request_commit = ff_layout_mark_request_commit, > > + .clear_request_commit = pnfs_generic_clear_request_commit, > > + .scan_commit_lists = pnfs_generic_scan_commit_lists, > > + .recover_commit_reqs = pnfs_generic_recover_commit_reqs, > > + .commit_pagelist = ff_layout_commit_pagelist, > > + .read_pagelist = ff_layout_read_pagelist, > > + .write_pagelist = ff_layout_write_pagelist, > > + .alloc_deviceid_node = ff_layout_alloc_deviceid_node, > > + .encode_layoutreturn = ff_layout_encode_layoutreturn, > > +}; > > + > > +static int __init nfs4flexfilelayout_init(void) > > +{ > > + printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Registering...\n", > > + __func__); > > + return pnfs_register_layoutdriver(&flexfilelayout_type); > > +} > > + > > +static void __exit nfs4flexfilelayout_exit(void) > > +{ > > + printk(KERN_INFO "%s: NFSv4 Flexfile Layout Driver Unregistering...\n", > > + __func__); > > + pnfs_unregister_layoutdriver(&flexfilelayout_type); > > +} > > + > > +MODULE_ALIAS("nfs-layouttype4-4"); > > + > > +MODULE_LICENSE("GPL"); > > +MODULE_DESCRIPTION("The NFSv4 flexfile layout driver"); > > + > > +module_init(nfs4flexfilelayout_init); > > +module_exit(nfs4flexfilelayout_exit); > > diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h > > new file mode 100644 > > index 0000000..712fc55 > > --- /dev/null > > +++ b/fs/nfs/flexfilelayout/flexfilelayout.h > > @@ -0,0 +1,158 @@ > > +/* > > + * NFSv4 flexfile layout driver data structures. > > + * > > + * Copyright (c) 2014, Primary Data, Inc. All rights reserved. > > + * > > + * Tao Peng <bergwolf@xxxxxxxxxxxxxxx> > > + */ > > + > > +#ifndef FS_NFS_NFS4FLEXFILELAYOUT_H > > +#define FS_NFS_NFS4FLEXFILELAYOUT_H > > + > > +#include "../pnfs.h" > > + > > +/* XXX: Let's filter out insanely large mirror count for now to avoid oom > > + * due to network error etc. */ > > +#define NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT 4096 > > + > > +struct nfs4_ff_ds_version { > > + u32 version; > > + u32 minor_version; > > + u32 rsize; > > + u32 wsize; > > + bool tightly_coupled; > > +}; > > + > > +/* chained in global deviceid hlist */ > > +struct nfs4_ff_layout_ds { > > + struct nfs4_deviceid_node id_node; > > + u32 ds_versions_cnt; > > + struct nfs4_ff_ds_version *ds_versions; > > + struct nfs4_pnfs_ds *ds; > > +}; > > + > > +struct nfs4_ff_layout_ds_err { > > + struct list_head list; /* linked in mirror error_list */ > > + u64 offset; > > + u64 length; > > + int status; > > + enum nfs_opnum4 opnum; > > + nfs4_stateid stateid; > > + struct nfs4_deviceid deviceid; > > +}; > > + > > +struct nfs4_ff_layout_mirror { > > + u32 ds_count; > > + u32 efficiency; > > + struct nfs4_ff_layout_ds *mirror_ds; > > + u32 fh_versions_cnt; > > + struct nfs_fh *fh_versions; > > + nfs4_stateid stateid; > > + union { > > + struct { /* same as struct unx_cred */ > > + u32 uid; /* -1 iff AUTH_NONE */ > > + u32 gid; /* -1 iff AUTH_NONE */ > > + u32 gids[16]; > > + }; > > + }; > > + struct rpc_cred *cred; > > + spinlock_t lock; > > +}; > > + > > +struct nfs4_ff_layout_segment { > > + struct pnfs_layout_segment generic_hdr; > > + u64 stripe_unit; > > + u32 mirror_array_cnt; > > + struct nfs4_ff_layout_mirror **mirror_array; > > +}; > > + > > +struct nfs4_flexfile_layout { > > + struct pnfs_layout_hdr generic_hdr; > > + struct pnfs_ds_commit_info commit_info; > > + struct list_head error_list; /* nfs4_ff_layout_ds_err */ > > +}; > > + > > +static inline struct nfs4_flexfile_layout * > > +FF_LAYOUT_FROM_HDR(struct pnfs_layout_hdr *lo) > > +{ > > + return container_of(lo, struct nfs4_flexfile_layout, generic_hdr); > > +} > > + > > +static inline struct nfs4_ff_layout_segment * > > +FF_LAYOUT_LSEG(struct pnfs_layout_segment *lseg) > > +{ > > + return container_of(lseg, > > + struct nfs4_ff_layout_segment, > > + generic_hdr); > > +} > > + > > +static inline struct nfs4_deviceid_node * > > +FF_LAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg, u32 idx) > > +{ > > + if (idx >= FF_LAYOUT_LSEG(lseg)->mirror_array_cnt || > > + FF_LAYOUT_LSEG(lseg)->mirror_array[idx] == NULL || > > + FF_LAYOUT_LSEG(lseg)->mirror_array[idx]->mirror_ds == NULL) > > + return NULL; > > + return &FF_LAYOUT_LSEG(lseg)->mirror_array[idx]->mirror_ds->id_node; > > +} > > + > > +static inline struct nfs4_ff_layout_ds * > > +FF_LAYOUT_MIRROR_DS(struct nfs4_deviceid_node *node) > > +{ > > + return container_of(node, struct nfs4_ff_layout_ds, id_node); > > +} > > + > > +static inline struct nfs4_ff_layout_mirror * > > +FF_LAYOUT_COMP(struct pnfs_layout_segment *lseg, u32 idx) > > +{ > > + if (idx >= FF_LAYOUT_LSEG(lseg)->mirror_array_cnt) > > + return NULL; > > + return FF_LAYOUT_LSEG(lseg)->mirror_array[idx]; > > +} > > + > > +static inline u32 > > +FF_LAYOUT_MIRROR_COUNT(struct pnfs_layout_segment *lseg) > > +{ > > + return FF_LAYOUT_LSEG(lseg)->mirror_array_cnt; > > +} > > + > > +static inline bool > > +ff_layout_test_devid_unavailable(struct nfs4_deviceid_node *node) > > +{ > > + return nfs4_test_deviceid_unavailable(node); > > +} > > + > > +static inline int > > +nfs4_ff_layout_ds_version(struct pnfs_layout_segment *lseg, u32 ds_idx) > > +{ > > + return FF_LAYOUT_COMP(lseg, ds_idx)->mirror_ds->ds_versions[0].version; > > +} > > + > > +struct nfs4_ff_layout_ds * > > +nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, > > + gfp_t gfp_flags); > > +void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds); > > +void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds); > > +int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, > > + struct nfs4_ff_layout_mirror *mirror, u64 offset, > > + u64 length, int status, enum nfs_opnum4 opnum, > > + gfp_t gfp_flags); > > +int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo, > > + struct xdr_stream *xdr, int *count, > > + const struct pnfs_layout_range *range); > > +struct nfs_fh * > > +nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx); > > + > > +struct nfs4_pnfs_ds * > > +nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, > > + bool fail_return); > > + > > +struct rpc_clnt * > > +nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg, > > + u32 ds_idx, > > + struct nfs_client *ds_clp, > > + struct inode *inode); > > +struct rpc_cred *ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, > > + u32 ds_idx, struct rpc_cred *mdscred); > > +bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg); > > +#endif /* FS_NFS_NFS4FLEXFILELAYOUT_H */ > > diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c > > new file mode 100644 > > index 0000000..5dae5c2 > > --- /dev/null > > +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c > > @@ -0,0 +1,552 @@ > > +/* > > + * Device operations for the pnfs nfs4 file layout driver. > > + * > > + * Copyright (c) 2014, Primary Data, Inc. All rights reserved. > > + * > > + * Tao Peng <bergwolf@xxxxxxxxxxxxxxx> > > + */ > > + > > +#include <linux/nfs_fs.h> > > +#include <linux/vmalloc.h> > > +#include <linux/module.h> > > +#include <linux/sunrpc/addr.h> > > + > > +#include "../internal.h" > > +#include "../nfs4session.h" > > +#include "flexfilelayout.h" > > + > > +#define NFSDBG_FACILITY NFSDBG_PNFS_LD > > + > > +static unsigned int dataserver_timeo = NFS4_DEF_DS_TIMEO; > > +static unsigned int dataserver_retrans = NFS4_DEF_DS_RETRANS; > > + > > +void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds) > > +{ > > + if (mirror_ds) > > + nfs4_put_deviceid_node(&mirror_ds->id_node); > > +} > > + > > +void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds) > > +{ > > + nfs4_print_deviceid(&mirror_ds->id_node.deviceid); > > + nfs4_pnfs_ds_put(mirror_ds->ds); > > + kfree(mirror_ds); > > +} > > + > > +/* Decode opaque device data and construct new_ds using it */ > > +struct nfs4_ff_layout_ds * > > +nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, > > + gfp_t gfp_flags) > > +{ > > + struct xdr_stream stream; > > + struct xdr_buf buf; > > + struct page *scratch; > > + struct list_head dsaddrs; > > + struct nfs4_pnfs_ds_addr *da; > > + struct nfs4_ff_layout_ds *new_ds = NULL; > > + struct nfs4_ff_ds_version *ds_versions = NULL; > > + u32 mp_count; > > + u32 version_count; > > + __be32 *p; > > + int i, ret = -ENOMEM; > > + > > + /* set up xdr stream */ > > + scratch = alloc_page(gfp_flags); > > + if (!scratch) > > + goto out_err; > > + > > + new_ds = kzalloc(sizeof(struct nfs4_ff_layout_ds), gfp_flags); > > + if (!new_ds) > > + goto out_scratch; > > + > > + nfs4_init_deviceid_node(&new_ds->id_node, > > + server, > > + &pdev->dev_id); > > + INIT_LIST_HEAD(&dsaddrs); > > + > > + xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen); > > + xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); > > + > > + /* multipath count */ > > + p = xdr_inline_decode(&stream, 4); > > + if (unlikely(!p)) > > + goto out_err_drain_dsaddrs; > > + mp_count = be32_to_cpup(p); > > + dprintk("%s: multipath ds count %d\n", __func__, mp_count); > > + > > + for (i = 0; i < mp_count; i++) { > > + /* multipath ds */ > > + da = nfs4_decode_mp_ds_addr(server->nfs_client->cl_net, > > + &stream, gfp_flags); > > + if (da) > > + list_add_tail(&da->da_node, &dsaddrs); > > + } > > + if (list_empty(&dsaddrs)) { > > + dprintk("%s: no suitable DS addresses found\n", > > + __func__); > > + ret = -ENOMEDIUM; > > + goto out_err_drain_dsaddrs; > > + } > > + > > + /* version count */ > > + p = xdr_inline_decode(&stream, 4); > > + if (unlikely(!p)) > > + goto out_err_drain_dsaddrs; > > + version_count = be32_to_cpup(p); > > + dprintk("%s: version count %d\n", __func__, version_count); > > + > > + ds_versions = kzalloc(version_count * sizeof(struct nfs4_ff_ds_version), > > + gfp_flags); > > + if (!ds_versions) > > + goto out_scratch; > > + > > + for (i = 0; i < version_count; i++) { > > + /* 20 = version(4) + minor_version(4) + rsize(4) + wsize(4) + > > + * tightly_coupled(4) */ > > + p = xdr_inline_decode(&stream, 20); > > + if (unlikely(!p)) > > + goto out_err_drain_dsaddrs; > > + ds_versions[i].version = be32_to_cpup(p++); > > + ds_versions[i].minor_version = be32_to_cpup(p++); > > + ds_versions[i].rsize = nfs_block_size(be32_to_cpup(p++), NULL); > > + ds_versions[i].wsize = nfs_block_size(be32_to_cpup(p++), NULL); > > + ds_versions[i].tightly_coupled = be32_to_cpup(p); > > + > > + if (ds_versions[i].rsize > NFS_MAX_FILE_IO_SIZE) > > + ds_versions[i].rsize = NFS_MAX_FILE_IO_SIZE; > > + if (ds_versions[i].wsize > NFS_MAX_FILE_IO_SIZE) > > + ds_versions[i].wsize = NFS_MAX_FILE_IO_SIZE; > > + > > + if (ds_versions[i].version != 3 || ds_versions[i].minor_version != 0) { > > + dprintk("%s: [%d] unsupported ds version %d-%d\n", __func__, > > + i, ds_versions[i].version, > > + ds_versions[i].minor_version); > > + ret = -EPROTONOSUPPORT; > > + goto out_err_drain_dsaddrs; > > + } > > + > > + dprintk("%s: [%d] vers %u minor_ver %u rsize %u wsize %u coupled %d\n", > > + __func__, i, ds_versions[i].version, > > + ds_versions[i].minor_version, > > + ds_versions[i].rsize, > > + ds_versions[i].wsize, > > + ds_versions[i].tightly_coupled); > > + } > > + > > + new_ds->ds_versions = ds_versions; > > + new_ds->ds_versions_cnt = version_count; > > + > > + new_ds->ds = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags); > > + if (!new_ds->ds) > > + goto out_err_drain_dsaddrs; > > + > > + /* If DS was already in cache, free ds addrs */ > > + while (!list_empty(&dsaddrs)) { > > + da = list_first_entry(&dsaddrs, > > + struct nfs4_pnfs_ds_addr, > > + da_node); > > + list_del_init(&da->da_node); > > + kfree(da->da_remotestr); > > + kfree(da); > > + } > > + > > + __free_page(scratch); > > + return new_ds; > > + > > +out_err_drain_dsaddrs: > > + while (!list_empty(&dsaddrs)) { > > + da = list_first_entry(&dsaddrs, struct nfs4_pnfs_ds_addr, > > + da_node); > > + list_del_init(&da->da_node); > > + kfree(da->da_remotestr); > > + kfree(da); > > + } > > + > > + kfree(ds_versions); > > +out_scratch: > > + __free_page(scratch); > > +out_err: > > + kfree(new_ds); > > + > > + dprintk("%s ERROR: returning %d\n", __func__, ret); > > + return NULL; > > +} > > + > > +static u64 > > +end_offset(u64 start, u64 len) > > +{ > > + u64 end; > > + > > + end = start + len; > > + return end >= start ? end : NFS4_MAX_UINT64; > > +} > > + > > +static void extend_ds_error(struct nfs4_ff_layout_ds_err *err, > > + u64 offset, u64 length) > > +{ > > + u64 end; > > + > > + end = max_t(u64, end_offset(err->offset, err->length), > > + end_offset(offset, length)); > > + err->offset = min_t(u64, err->offset, offset); > > + err->length = end - err->offset; > > +} > > + > > +static bool ds_error_can_merge(struct nfs4_ff_layout_ds_err *err, u64 offset, > > + u64 length, int status, enum nfs_opnum4 opnum, > > + nfs4_stateid *stateid, > > + struct nfs4_deviceid *deviceid) > > +{ > > + return err->status == status && err->opnum == opnum && > > + nfs4_stateid_match(&err->stateid, stateid) && > > + !memcmp(&err->deviceid, deviceid, sizeof(*deviceid)) && > > + end_offset(err->offset, err->length) >= offset && > > + err->offset <= end_offset(offset, length); > > +} > > + > > +static bool merge_ds_error(struct nfs4_ff_layout_ds_err *old, > > + struct nfs4_ff_layout_ds_err *new) > > +{ > > + if (!ds_error_can_merge(old, new->offset, new->length, new->status, > > + new->opnum, &new->stateid, &new->deviceid)) > > + return false; > > + > > + extend_ds_error(old, new->offset, new->length); > > + return true; > > +} > > + > > +static bool > > +ff_layout_add_ds_error_locked(struct nfs4_flexfile_layout *flo, > > + struct nfs4_ff_layout_ds_err *dserr) > > +{ > > + struct nfs4_ff_layout_ds_err *err; > > + > > + list_for_each_entry(err, &flo->error_list, list) { > > + if (merge_ds_error(err, dserr)) { > > + return true; > > + } > > + } > > + > > + list_add(&dserr->list, &flo->error_list); > > + return false; > > +} > > + > > +static bool > > +ff_layout_update_ds_error(struct nfs4_flexfile_layout *flo, u64 offset, > > + u64 length, int status, enum nfs_opnum4 opnum, > > + nfs4_stateid *stateid, struct nfs4_deviceid *deviceid) > > +{ > > + bool found = false; > > + struct nfs4_ff_layout_ds_err *err; > > + > > + list_for_each_entry(err, &flo->error_list, list) { > > + if (ds_error_can_merge(err, offset, length, status, opnum, > > + stateid, deviceid)) { > > + found = true; > > + extend_ds_error(err, offset, length); > > + break; > > + } > > + } > > + > > + return found; > > +} > > + > > +int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, > > + struct nfs4_ff_layout_mirror *mirror, u64 offset, > > + u64 length, int status, enum nfs_opnum4 opnum, > > + gfp_t gfp_flags) > > +{ > > + struct nfs4_ff_layout_ds_err *dserr; > > + bool needfree; > > + > > + if (status == 0) > > + return 0; > > + > > + if (mirror->mirror_ds == NULL) > > + return -EINVAL; > > + > > + spin_lock(&flo->generic_hdr.plh_inode->i_lock); > > + if (ff_layout_update_ds_error(flo, offset, length, status, opnum, > > + &mirror->stateid, > > + &mirror->mirror_ds->id_node.deviceid)) { > > + spin_unlock(&flo->generic_hdr.plh_inode->i_lock); > > + return 0; > > + } > > + spin_unlock(&flo->generic_hdr.plh_inode->i_lock); > > + dserr = kmalloc(sizeof(*dserr), gfp_flags); > > + if (!dserr) > > + return -ENOMEM; > > + > > + INIT_LIST_HEAD(&dserr->list); > > + dserr->offset = offset; > > + dserr->length = length; > > + dserr->status = status; > > + dserr->opnum = opnum; > > + nfs4_stateid_copy(&dserr->stateid, &mirror->stateid); > > + memcpy(&dserr->deviceid, &mirror->mirror_ds->id_node.deviceid, > > + NFS4_DEVICEID4_SIZE); > > + > > + spin_lock(&flo->generic_hdr.plh_inode->i_lock); > > + needfree = ff_layout_add_ds_error_locked(flo, dserr); > > + spin_unlock(&flo->generic_hdr.plh_inode->i_lock); > > + if (needfree) > > + kfree(dserr); > > + > > + return 0; > > +} > > + > > +/* currently we only support AUTH_NONE and AUTH_SYS */ > > +static rpc_authflavor_t > > +nfs4_ff_layout_choose_authflavor(struct nfs4_ff_layout_mirror *mirror) > > +{ > > + if (mirror->uid == (u32)-1) > > + return RPC_AUTH_NULL; > > + return RPC_AUTH_UNIX; > > +} > > + > > +/* fetch cred for NFSv3 DS */ > > +static int ff_layout_update_mirror_cred(struct nfs4_ff_layout_mirror *mirror, > > + struct nfs4_pnfs_ds *ds) > > +{ > > + if (ds && !mirror->cred && mirror->mirror_ds->ds_versions[0].version == 3) { > > + struct rpc_auth *auth = ds->ds_clp->cl_rpcclient->cl_auth; > > + struct rpc_cred *cred; > > + struct auth_cred acred = { > > + .uid = make_kuid(&init_user_ns, mirror->uid), > > + .gid = make_kgid(&init_user_ns, mirror->gid), > > + }; > > + > > + /* AUTH_NULL ignores acred */ > > + cred = auth->au_ops->lookup_cred(auth, &acred, 0); > > + if (IS_ERR(cred)) { > > + dprintk("%s: lookup_cred failed with %ld\n", > > + __func__, PTR_ERR(cred)); > > + return PTR_ERR(cred); > > + } else { > > + mirror->cred = cred; > > + } > > + } > > + return 0; > > +} > > + > > +struct nfs_fh * > > +nfs4_ff_layout_select_ds_fh(struct pnfs_layout_segment *lseg, u32 mirror_idx) > > +{ > > + struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, mirror_idx); > > + struct nfs_fh *fh = NULL; > > + struct nfs4_deviceid_node *devid; > > + > > + if (mirror == NULL || mirror->mirror_ds == NULL || > > + mirror->mirror_ds->ds == NULL) { > > + printk(KERN_ERR "NFS: %s: No data server for mirror offset index %d\n", > > + __func__, mirror_idx); > > + if (mirror && mirror->mirror_ds) { > > + devid = &mirror->mirror_ds->id_node; > > + pnfs_generic_mark_devid_invalid(devid); > > + } > > + goto out; > > + } > > + > > + /* FIXME: For now assume there is only 1 version available for the DS */ > > + fh = &mirror->fh_versions[0]; > > +out: > > + return fh; > > +} > > + > > +/* Upon return, either ds is connected, or ds is NULL */ > > +struct nfs4_pnfs_ds * > > +nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx, > > + bool fail_return) > > +{ > > + struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx); > > + struct nfs4_pnfs_ds *ds = NULL; > > + struct nfs4_deviceid_node *devid; > > + struct inode *ino = lseg->pls_layout->plh_inode; > > + struct nfs_server *s = NFS_SERVER(ino); > > + unsigned int max_payload; > > + rpc_authflavor_t flavor; > > + > > + if (mirror == NULL || mirror->mirror_ds == NULL || > > + mirror->mirror_ds->ds == NULL) { > > + printk(KERN_ERR "NFS: %s: No data server for offset index %d\n", > > + __func__, ds_idx); > > + if (mirror && mirror->mirror_ds) { > > + devid = &mirror->mirror_ds->id_node; > > + pnfs_generic_mark_devid_invalid(devid); > > + } > > + goto out; > > + } > > + > > + ds = mirror->mirror_ds->ds; > > + devid = &mirror->mirror_ds->id_node; > > + > > + /* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */ > > + smp_rmb(); > > + if (ds->ds_clp) > > + goto out_test_devid; > > + > > + flavor = nfs4_ff_layout_choose_authflavor(mirror); > > + > > + /* FIXME: For now we assume the server sent only one version of NFS > > + * to use for the DS. > > + */ > > + nfs4_pnfs_ds_connect(s, ds, devid, dataserver_timeo, > > + dataserver_retrans, > > + mirror->mirror_ds->ds_versions[0].version, > > + mirror->mirror_ds->ds_versions[0].minor_version, > > + flavor); > > + > > + /* connect success, check rsize/wsize limit */ > > + if (ds->ds_clp) { > > + max_payload = > > + nfs_block_size(rpc_max_payload(ds->ds_clp->cl_rpcclient), > > + NULL); > > + if (mirror->mirror_ds->ds_versions[0].rsize > max_payload) > > + mirror->mirror_ds->ds_versions[0].rsize = max_payload; > > + if (mirror->mirror_ds->ds_versions[0].wsize > max_payload) > > + mirror->mirror_ds->ds_versions[0].wsize = max_payload; > > + } else { > > + ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout), > > + mirror, lseg->pls_range.offset, > > + lseg->pls_range.length, NFS4ERR_NXIO, > > + OP_ILLEGAL, GFP_NOIO); > > + if (fail_return) { > > + pnfs_error_mark_layout_for_return(ino, lseg); > > + if (ff_layout_has_available_ds(lseg)) > > + pnfs_set_retry_layoutget(lseg->pls_layout); > > + else > > + pnfs_clear_retry_layoutget(lseg->pls_layout); > > + > > + } else { > > + if (ff_layout_has_available_ds(lseg)) > > + set_bit(NFS_LAYOUT_RETURN_BEFORE_CLOSE, > > + &lseg->pls_layout->plh_flags); > > + else { > > + pnfs_error_mark_layout_for_return(ino, lseg); > > + pnfs_clear_retry_layoutget(lseg->pls_layout); > > + } > > + } > > + } > > + > > +out_test_devid: > > + if (ff_layout_test_devid_unavailable(devid)) > > + ds = NULL; > > +out: > > + if (ff_layout_update_mirror_cred(mirror, ds)) > > + ds = NULL; > > + return ds; > > +} > > + > > +struct rpc_cred * > > +ff_layout_get_ds_cred(struct pnfs_layout_segment *lseg, u32 ds_idx, > > + struct rpc_cred *mdscred) > > +{ > > + struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx); > > + struct rpc_cred *cred = ERR_PTR(-EINVAL); > > + > > + if (!nfs4_ff_layout_prepare_ds(lseg, ds_idx, true)) > > + goto out; > > + > > + if (mirror && mirror->cred) > > + cred = mirror->cred; > > + else > > + cred = mdscred; > > +out: > > + return cred; > > +} > > + > > +/** > > +* Find or create a DS rpc client with th MDS server rpc client auth flavor > > +* in the nfs_client cl_ds_clients list. > > +*/ > > +struct rpc_clnt * > > +nfs4_ff_find_or_create_ds_client(struct pnfs_layout_segment *lseg, u32 ds_idx, > > + struct nfs_client *ds_clp, struct inode *inode) > > +{ > > + struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx); > > + > > + switch (mirror->mirror_ds->ds_versions[0].version) { > > + case 3: > > + /* For NFSv3 DS, flavor is set when creating DS connections */ > > + return ds_clp->cl_rpcclient; > > + case 4: > > + return nfs4_find_or_create_ds_client(ds_clp, inode); > > + default: > > + BUG(); > > + } > > +} > > + > > +static bool is_range_intersecting(u64 offset1, u64 length1, > > + u64 offset2, u64 length2) > > +{ > > + u64 end1 = end_offset(offset1, length1); > > + u64 end2 = end_offset(offset2, length2); > > + > > + return (end1 == NFS4_MAX_UINT64 || end1 > offset2) && > > + (end2 == NFS4_MAX_UINT64 || end2 > offset1); > > +} > > + > > +/* called with inode i_lock held */ > > +int ff_layout_encode_ds_ioerr(struct nfs4_flexfile_layout *flo, > > + struct xdr_stream *xdr, int *count, > > + const struct pnfs_layout_range *range) > > +{ > > + struct nfs4_ff_layout_ds_err *err, *n; > > + __be32 *p; > > + > > + list_for_each_entry_safe(err, n, &flo->error_list, list) { > > + if (!is_range_intersecting(err->offset, err->length, > > + range->offset, range->length)) > > + continue; > > + /* offset(8) + length(8) + stateid(NFS4_STATEID_SIZE) > > + * + deviceid(NFS4_DEVICEID4_SIZE) + status(4) + opnum(4) > > + */ > > + p = xdr_reserve_space(xdr, > > + 24 + NFS4_STATEID_SIZE + NFS4_DEVICEID4_SIZE); > > + if (unlikely(!p)) > > + return -ENOBUFS; > > + p = xdr_encode_hyper(p, err->offset); > > + p = xdr_encode_hyper(p, err->length); > > + p = xdr_encode_opaque_fixed(p, &err->stateid, > > + NFS4_STATEID_SIZE); > > + p = xdr_encode_opaque_fixed(p, &err->deviceid, > > + NFS4_DEVICEID4_SIZE); > > + *p++ = cpu_to_be32(err->status); > > + *p++ = cpu_to_be32(err->opnum); > > + *count += 1; > > + list_del(&err->list); > > + kfree(err); > > + dprintk("%s: offset %llu length %llu status %d op %d count %d\n", > > + __func__, err->offset, err->length, err->status, > > + err->opnum, *count); > > + } > > + > > + return 0; > > +} > > + > > +bool ff_layout_has_available_ds(struct pnfs_layout_segment *lseg) > > +{ > > + struct nfs4_ff_layout_mirror *mirror; > > + struct nfs4_deviceid_node *devid; > > + int idx; > > + > > + for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) { > > + mirror = FF_LAYOUT_COMP(lseg, idx); > > + if (mirror && mirror->mirror_ds) { > > + devid = &mirror->mirror_ds->id_node; > > + if (!ff_layout_test_devid_unavailable(devid)) > > + return true; > > + } > > + } > > + > > + return false; > > +} > > + > > +module_param(dataserver_retrans, uint, 0644); > > +MODULE_PARM_DESC(dataserver_retrans, "The number of times the NFSv4.1 client " > > + "retries a request before it attempts further " > > + " recovery action."); > > +module_param(dataserver_timeo, uint, 0644); > > +MODULE_PARM_DESC(dataserver_timeo, "The time (in tenths of a second) the " > > + "NFSv4.1 client waits for a response from a " > > + " data server before it retries an NFS request."); > > diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h > > index 022b761..de7c91c 100644 > > --- a/include/linux/nfs4.h > > +++ b/include/linux/nfs4.h > > @@ -516,6 +516,7 @@ enum pnfs_layouttype { > > LAYOUT_NFSV4_1_FILES = 1, > > LAYOUT_OSD2_OBJECTS = 2, > > LAYOUT_BLOCK_VOLUME = 3, > > + LAYOUT_FLEX_FILES = 4, > > }; > > > > /* used for both layout return and recall */ > > > -- To unsubscribe from this list: send the line "unsubscribe linux-nfs" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html