Re: [PATCH 14/18] nfsd: pNFS block layout driver

Tom Haynes <thomas.haynes@xxxxxxxxxxxxxxx> · Mon, 12 Jan 2015 01:14:19 -0500

On Tue, Jan 06, 2015 at 05:28:37PM +0100, Christoph Hellwig wrote:
> Add a small shim between core nfsd and filesystems to translate the
> somewhat cumbersome pNFS data structures and semantics to something
> more palatable for Linux filesystems.
> 
> Signed-off-by: Christoph Hellwig <hch@xxxxxx>
> ---
>  .../filesystems/nfs/pnfs-block-server.txt          |  40 +++++
>  fs/nfsd/Makefile                                   |   2 +-
>  fs/nfsd/blocklayout.c                              | 194 +++++++++++++++++++++
>  fs/nfsd/blocklayoutxdr.c                           | 157 +++++++++++++++++
>  fs/nfsd/blocklayoutxdr.h                           |  62 +++++++
>  fs/nfsd/nfs4layouts.c                              |   7 +
>  fs/nfsd/pnfs.h                                     |   1 +
>  7 files changed, 462 insertions(+), 1 deletion(-)
>  create mode 100644 Documentation/filesystems/nfs/pnfs-block-server.txt
>  create mode 100644 fs/nfsd/blocklayout.c
>  create mode 100644 fs/nfsd/blocklayoutxdr.c
>  create mode 100644 fs/nfsd/blocklayoutxdr.h
> 
> diff --git a/Documentation/filesystems/nfs/pnfs-block-server.txt b/Documentation/filesystems/nfs/pnfs-block-server.txt
> new file mode 100644
> index 0000000..f45d399
> --- /dev/null
> +++ b/Documentation/filesystems/nfs/pnfs-block-server.txt
> @@ -0,0 +1,40 @@
> +pNFS block layout server user guide
> +
> +The Linux NFS server now supports the pNFS block layout extension.  In this
> +case the NFS server acts as Metadata Server (MDS) for pNFS, which in addition
> +to handling all the metadata access to the NFS export also hands out layouts
> +to the clients to directly access the underlying block devices that is

Suggested rewording: "to the clients. The layout allows the client to directly access the underlying block devices that are" (note s/is/are/)

> +shared with the client.  Note that there are no Data Servers (DSs) in the
> +block layout flavor of pNFS.

Which is why the spec calls them storage devices. 

> +
> +To use pNFS block layouts with with the Linux NFS server the exported file
> +system needs to support the pNFS block layouts (current just XFS), and the

/current/currently/

> +file system must sit on shared storage (typically iSCSI) that is accessible
> +to the clients as well as the server.  The file system needs to either sit
> +directly on the exported volume, or on a RAID 0 using the MD software RAID

a RAID 0 what?

> +driver with the version 1 superblock format.  If the filesystem uses sits

In general, /filesystem/file system/

/filesystem uses/file system it uses/

> +on a RAID 0 device the clients will automatically stripe their I/O over
> +multiple LUNs.
> +
> +On the server pNFS block volume support is automatically if the file system
> +support its.  On the client make sure the kernel has the CONFIG_PNFS_BLOCK

/support its/supports it/

> +option enabled, the blkmapd daemon from nfs-utils is running, and the
> +file system, is mounted using the NFSv4.1 protocol version (mount -o vers=4.1).

/system, is/system is/

> +
> +If the nfsd server needs to fence a non-responding client it calls
> +/sbin/nfsd-recall-failed with the first argument set to the IP address of
> +the client, and the second argument set to the device node without the /dev
> +prefix for the filesystem to be fenced. Below is an example file that show

/show/shows/

> +how to translate the device into a serial number from SCSI EVPD 0x80:
> +
> +cat > /sbin/nfsd-recall-failed << EOF
> +#!/bin/sh
> +
> +CLIENT="$1"
> +DEV="/dev/$2"
> +EVPD=`sg_inq --page=0x80 ${DEV} | \
> +	grep "Unit serial number:" | \
> +	awk -F ': ' '{print $2}'`
> +
> +echo "fencing client ${CLIENT} serial ${EVPD}" >> /var/log/pnfsd-fence.log
> +EOF
> diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
> index 6cba933..9a6028e 100644
> --- a/fs/nfsd/Makefile
> +++ b/fs/nfsd/Makefile
> @@ -17,4 +17,4 @@ nfsd-$(CONFIG_NFSD_V3)	+= nfs3proc.o nfs3xdr.o
>  nfsd-$(CONFIG_NFSD_V3_ACL) += nfs3acl.o
>  nfsd-$(CONFIG_NFSD_V4)	+= nfs4proc.o nfs4xdr.o nfs4state.o nfs4idmap.o \
>  			   nfs4acl.o nfs4callback.o nfs4recover.o
> -nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o
> +nfsd-$(CONFIG_NFSD_PNFS) += nfs4layouts.o blocklayout.o blocklayoutxdr.o
> diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c
> new file mode 100644
> index 0000000..a14e358
> --- /dev/null
> +++ b/fs/nfsd/blocklayout.c
> @@ -0,0 +1,194 @@
> +/*
> + * Copyright (c) 2014 Christoph Hellwig.
> + */
> +#include <linux/exportfs.h>
> +#include <linux/genhd.h>
> +#include <linux/slab.h>
> +#include <linux/raid_class.h>
> +
> +#include <linux/nfsd/debug.h>
> +
> +#include "blocklayoutxdr.h"
> +#include "pnfs.h"
> +
> +#define NFSDDBG_FACILITY	NFSDDBG_PNFS
> +
> +
> +static int
> +nfsd4_block_get_device_info_simple(struct super_block *sb,
> +		struct nfsd4_getdeviceinfo *gdp)
> +{
> +	struct pnfs_block_deviceaddr *dev;
> +	struct pnfs_block_volume *b;
> +
> +	dev = kzalloc(sizeof(struct pnfs_block_deviceaddr) +
> +		      sizeof(struct pnfs_block_volume), GFP_KERNEL);
> +	if (!dev)
> +		return -ENOMEM;
> +	gdp->gd_device = dev;
> +
> +	dev->nr_volumes = 1;
> +	b = &dev->volumes[0];
> +
> +	b->type = PNFS_BLOCK_VOLUME_SIMPLE;
> +	b->simple.sig_len = PNFS_BLOCK_UUID_LEN;
> +	return sb->s_export_op->get_uuid(sb, b->simple.sig, &b->simple.sig_len,
> +			&b->simple.offset);
> +}
> +
> +static __be32
> +nfsd4_block_proc_getdeviceinfo(struct super_block *sb,
> +		struct nfsd4_getdeviceinfo *gdp)
> +{
> +	if (sb->s_bdev != sb->s_bdev->bd_contains)
> +		return nfserr_inval;
> +	return nfserrno(nfsd4_block_get_device_info_simple(sb, gdp));
> +}
> +
> +static __be32
> +nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp,
> +		struct nfsd4_layoutget *args)
> +{
> +	struct nfsd4_layout_seg *seg = &args->lg_seg;
> +	struct super_block *sb = inode->i_sb;
> +	u32 block_size = (1 << inode->i_blkbits);
> +	struct pnfs_block_extent *bex;
> +	struct iomap iomap;
> +	u32 device_generation = 0;
> +	int error;
> +
> +	/*
> +	 * We do not attempt to support I/O smaller than the fs block size,
> +	 * or not aligned to it.
> +	 */
> +	if (args->lg_minlength < block_size) {
> +		dprintk("pnfsd: I/O too small\n");
> +		goto out_layoutunavailable;
> +	}
> +	if (seg->offset & (block_size - 1)) {
> +		dprintk("pnfsd: I/O misaligned\n");
> +		goto out_layoutunavailable;
> +	}
> +
> +	/*
> +	 * Some clients barf on non-zero block numbers for NONE or INVALID
> +	 * layouts, so make sure to zero the whole structure.
> +	 */
> +	error = -ENOMEM;
> +	bex = kzalloc(sizeof(*bex), GFP_KERNEL);
> +	if (!bex)
> +		goto out_error;

bex is allocated.

> +	args->lg_content = bex;
> +
> +	error = sb->s_export_op->map_blocks(inode, seg->offset, seg->length,
> +					    &iomap, seg->iomode != IOMODE_READ,
> +					    &device_generation);
> +	if (error) {
> +		if (error == -ENXIO)
> +			goto out_layoutunavailable;
> +		goto out_error;
> +	}
> +
> +	if (iomap.length < args->lg_minlength) {
> +		dprintk("pnfsd: extent smaller than minlength\n");
> +		goto out_layoutunavailable;
> +	}
> +
> +	switch (iomap.type) {
> +	case IOMAP_MAPPED:
> +		if (seg->iomode == IOMODE_READ)
> +			bex->es = PNFS_BLOCK_READ_DATA;
> +		else
> +			bex->es = PNFS_BLOCK_READWRITE_DATA;
> +		bex->soff = (iomap.blkno << 9);
> +		break;
> +	case IOMAP_UNWRITTEN:
> +		if (seg->iomode & IOMODE_RW) {
> +			/*
> +			 * Crack monkey special case from section 2.3.1.
> +			 */
> +			if (args->lg_minlength == 0) {
> +				dprintk("pnfsd: no soup for you!\n");
> +				goto out_layoutunavailable;
> +			}
> +
> +			bex->es = PNFS_BLOCK_INVALID_DATA;
> +			bex->soff = (iomap.blkno << 9);
> +			break;
> +		}
> +		/*FALLTHRU*/
> +	case IOMAP_HOLE:
> +		if (seg->iomode == IOMODE_READ) {
> +			bex->es = PNFS_BLOCK_NONE_DATA;
> +			break;
> +		}
> +		/*FALLTHRU*/
> +	case IOMAP_DELALLOC:
> +	default:
> +		WARN(1, "pnfsd: filesystem returned %d extent\n", iomap.type);
> +		goto out_layoutunavailable;
> +	}
> +
> +	error = nfsd4_set_deviceid(&bex->vol_id, fhp, device_generation);
> +	if (error)
> +		goto out_error;
> +	bex->foff = iomap.offset;
> +	bex->len = iomap.length;
> +
> +	seg->offset = iomap.offset;
> +	seg->length = iomap.length;
> +
> +	args->lg_roc = 1;
> +
> +	dprintk("GET: %lld:%lld %d\n", bex->foff, bex->len, bex->es);
> +	return 0;
> +
> +out_error:
> +	seg->length = 0;
> +	return nfserrno(error);
> +out_layoutunavailable:
> +	seg->length = 0;
> +	return nfserr_layoutunavailable;

What frees bex on the two error paths (out_error and out_layoutunavailable)?

The call flow seems to be:

nfsd4_proc_compound -> nfsd4_layoutget -> nfsd4_block_proc_layoutget

lg_content gets freed in nfsd4_encode_layoutget() in all paths.

nfsd4_encode_operation() calls nfsd4_encode_layoutget().

But nfsd4_encode_operation() does not reach nfsd4_encode_layoutget() on all of its paths:

        p = xdr_reserve_space(xdr, 8);
        if (!p) {
                WARN_ON_ONCE(1);
                return;  // leak
        }
...
        if (op->opnum == OP_ILLEGAL)
                goto status;  // Not really a leak, if we hit this, bigger issues apply.

So bex is freed whenever nfsd4_encode_layoutget() actually runs, but in
general nfsd4_encode_operation() can leak any operation-specific memory
on its early-return paths.

> +}
> +
> +static __be32
> +nfsd4_block_proc_layoutcommit(struct inode *inode,
> +		struct nfsd4_layoutcommit *lcp)
> +{
> +	loff_t new_size = lcp->lc_last_wr + 1;
> +	struct iattr iattr = { .ia_valid = 0 };
> +	struct iomap *iomaps;
> +	int nr_iomaps;
> +	int error;
> +
> +	nr_iomaps = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout,
> +			lcp->lc_up_len, &iomaps, 1 << inode->i_blkbits);
> +	if (nr_iomaps < 0)
> +		return nfserrno(nr_iomaps);
> +
> +	if (lcp->lc_mtime.tv_nsec == UTIME_NOW)
> +		lcp->lc_mtime = current_fs_time(inode->i_sb);
> +	if (timespec_compare(&lcp->lc_mtime, &inode->i_mtime) > 0) {
> +		iattr.ia_valid |= ATTR_ATIME | ATTR_CTIME | ATTR_MTIME;
> +		iattr.ia_atime = iattr.ia_ctime = iattr.ia_mtime =
> +				lcp->lc_mtime;
> +	}
> +
> +	if (new_size > i_size_read(inode)) {
> +		iattr.ia_valid |= ATTR_SIZE;
> +		iattr.ia_size = new_size;
> +	}
> +
> +	error = inode->i_sb->s_export_op->commit_blocks(inode, iomaps,
> +			nr_iomaps, &iattr);
> +	kfree(iomaps);
> +	return nfserrno(error);
> +}
> +
> +const struct nfsd4_layout_ops bl_layout_ops = {
> +	.proc_getdeviceinfo	= nfsd4_block_proc_getdeviceinfo,
> +	.encode_getdeviceinfo	= nfsd4_block_encode_getdeviceinfo,
> +	.proc_layoutget		= nfsd4_block_proc_layoutget,
> +	.encode_layoutget	= nfsd4_block_encode_layoutget,
> +	.proc_layoutcommit	= nfsd4_block_proc_layoutcommit,
> +};
> diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
> new file mode 100644
> index 0000000..9da89fd
> --- /dev/null
> +++ b/fs/nfsd/blocklayoutxdr.c
> @@ -0,0 +1,157 @@
> +/*
> + * Copyright (c) 2014 Christoph Hellwig.
> + */
> +#include <linux/sunrpc/svc.h>
> +#include <linux/exportfs.h>
> +#include <linux/nfs4.h>
> +
> +#include "nfsd.h"
> +#include "blocklayoutxdr.h"
> +
> +#define NFSDDBG_FACILITY	NFSDDBG_PNFS
> +
> +
> +__be32
> +nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
> +		struct nfsd4_layoutget *lgp)
> +{
> +	struct pnfs_block_extent *b = lgp->lg_content;
> +	int len = sizeof(__be32) + 5 * sizeof(__be64) + sizeof(__be32);
> +	__be32 *p;
> +
> +	p = xdr_reserve_space(xdr, sizeof(__be32) + len);
> +	if (!p)
> +		return nfserr_toosmall;
> +
> +	*p++ = cpu_to_be32(len);
> +	*p++ = cpu_to_be32(1);		/* we always return a single extent */
> +
> +	p = xdr_encode_opaque_fixed(p, &b->vol_id,
> +			sizeof(struct nfsd4_deviceid));
> +	p = xdr_encode_hyper(p, b->foff);
> +	p = xdr_encode_hyper(p, b->len);
> +	p = xdr_encode_hyper(p, b->soff);
> +	*p++ = cpu_to_be32(b->es);
> +	return 0;
> +}
> +
> +static int
> +nfsd4_block_encode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
> +{
> +	__be32 *p;
> +	int len;
> +
> +	switch (b->type) {
> +	case PNFS_BLOCK_VOLUME_SIMPLE:
> +		len = 4 + 4 + 8 + 4 + b->simple.sig_len;
> +		p = xdr_reserve_space(xdr, len);
> +		if (!p)
> +			return -ETOOSMALL;
> +
> +		*p++ = cpu_to_be32(b->type);
> +		*p++ = cpu_to_be32(1);	/* single signature */
> +		p = xdr_encode_hyper(p, b->simple.offset);
> +		p = xdr_encode_opaque(p, b->simple.sig, b->simple.sig_len);
> +		break;
> +	default:
> +		return -ENOTSUPP;
> +	}
> +
> +	return len;
> +}
> +
> +__be32
> +nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr,
> +		struct nfsd4_getdeviceinfo *gdp)
> +{
> +	struct pnfs_block_deviceaddr *dev = gdp->gd_device;
> +	int len = sizeof(__be32), ret, i;
> +	__be32 *p;
> +
> +	p = xdr_reserve_space(xdr, len + sizeof(__be32));
> +	if (!p)
> +		return nfserr_resource;
> +
> +	for (i = 0; i < dev->nr_volumes; i++) {
> +		ret = nfsd4_block_encode_volume(xdr, &dev->volumes[i]);
> +		if (ret < 0)
> +			return nfserrno(ret);
> +		len += ret;
> +	}
> +
> +	/*
> +	 * Fill in the overall length and number of volumes at the beginning
> +	 * of the layout.
> +	 */
> +	*p++ = cpu_to_be32(len);
> +	*p++ = cpu_to_be32(dev->nr_volumes);
> +	return 0;
> +}
> +
> +int
> +nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
> +		u32 block_size)
> +{
> +	struct iomap *iomaps;
> +	u32 nr_iomaps, expected, i;
> +
> +	if (len < sizeof(u32)) {
> +		dprintk("%s: extent array too small: %u\n", __func__, len);
> +		return -EINVAL;
> +	}
> +
> +	nr_iomaps = be32_to_cpup(p++);
> +	expected = sizeof(__be32) + nr_iomaps * NFS4_BLOCK_EXTENT_SIZE;
> +	if (len != expected) {
> +		dprintk("%s: extent array size mismatch: %u/%u\n",
> +			__func__, len, expected);
> +		return -EINVAL;
> +	}
> +
> +	iomaps = kcalloc(nr_iomaps, sizeof(*iomaps), GFP_KERNEL);
> +	if (!iomaps) {
> +		dprintk("%s: failed to allocate extent array\n", __func__);
> +		return -ENOMEM;
> +	}
> +
> +	for (i = 0; i < nr_iomaps; i++) {
> +		struct pnfs_block_extent bex;
> +
> +		memcpy(&bex.vol_id, p, sizeof(struct nfsd4_deviceid));
> +		p += XDR_QUADLEN(sizeof(struct nfsd4_deviceid));
> +
> +		p = xdr_decode_hyper(p, &bex.foff);
> +		if (bex.foff & (block_size - 1)) {
> +			dprintk("%s: unaligned offset %lld\n",
> +				__func__, bex.foff);
> +			goto fail;
> +		}
> +		p = xdr_decode_hyper(p, &bex.len);
> +		if (bex.len & (block_size - 1)) {
> +			dprintk("%s: unaligned length %lld\n",
> +				__func__, bex.foff);
> +			goto fail;
> +		}
> +		p = xdr_decode_hyper(p, &bex.soff);
> +		if (bex.soff & (block_size - 1)) {
> +			dprintk("%s: unaligned disk offset %lld\n",
> +				__func__, bex.soff);
> +			goto fail;
> +		}
> +		bex.es = be32_to_cpup(p++);
> +		if (bex.es != PNFS_BLOCK_READWRITE_DATA) {
> +			dprintk("%s: incorrect extent state %d\n",
> +				__func__, bex.es);
> +			goto fail;
> +		}
> +
> +		iomaps[i].offset = bex.foff;
> +		iomaps[i].length = bex.len;
> +	}
> +
> +	*iomapp = iomaps;
> +	return nr_iomaps;
> +fail:
> +	kfree(iomaps);
> +	return -EINVAL;
> +}
> diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h
> new file mode 100644
> index 0000000..fdc7903
> --- /dev/null
> +++ b/fs/nfsd/blocklayoutxdr.h
> @@ -0,0 +1,62 @@
> +#ifndef _NFSD_BLOCKLAYOUTXDR_H
> +#define _NFSD_BLOCKLAYOUTXDR_H 1
> +
> +#include <linux/blkdev.h>
> +#include "xdr4.h"
> +
> +struct iomap;
> +struct xdr_stream;
> +
> +enum pnfs_block_extent_state {
> +	PNFS_BLOCK_READWRITE_DATA	= 0,
> +	PNFS_BLOCK_READ_DATA		= 1,
> +	PNFS_BLOCK_INVALID_DATA		= 2,
> +	PNFS_BLOCK_NONE_DATA		= 3,
> +};
> +
> +struct pnfs_block_extent {
> +	struct nfsd4_deviceid		vol_id;
> +	u64				foff;
> +	u64				len;
> +	u64				soff;
> +	enum pnfs_block_extent_state	es;
> +};
> +#define NFS4_BLOCK_EXTENT_SIZE		44
> +
> +enum pnfs_block_volume_type {
> +	PNFS_BLOCK_VOLUME_SIMPLE	= 0,
> +	PNFS_BLOCK_VOLUME_SLICE		= 1,
> +	PNFS_BLOCK_VOLUME_CONCAT	= 2,
> +	PNFS_BLOCK_VOLUME_STRIPE	= 3,
> +};
> +
> +/*
> + * Random upper cap for the uuid length to avoid unbounded allocation.
> + * Not actually limited by the protocol.
> + */
> +#define PNFS_BLOCK_UUID_LEN	128
> +
> +struct pnfs_block_volume {
> +	enum pnfs_block_volume_type	type;
> +	union {
> +		struct {
> +			u64		offset;
> +			u32		sig_len;
> +			u8		sig[PNFS_BLOCK_UUID_LEN];
> +		} simple;
> +	};
> +};
> +
> +struct pnfs_block_deviceaddr {
> +	u32				nr_volumes;
> +	struct pnfs_block_volume	volumes[];
> +};
> +
> +__be32 nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr,
> +		struct nfsd4_getdeviceinfo *gdp);
> +__be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr,
> +		struct nfsd4_layoutget *lgp);
> +int nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp,
> +		u32 block_size);
> +
> +#endif /* _NFSD_BLOCKLAYOUTXDR_H */
> diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
> index bb91981..8353b7a 100644
> --- a/fs/nfsd/nfs4layouts.c
> +++ b/fs/nfsd/nfs4layouts.c
> @@ -26,6 +26,7 @@ static struct nfsd4_callback_ops nfsd4_cb_layout_ops;
>  static const struct lock_manager_operations nfsd4_layouts_lm_ops;
>  
>  const struct nfsd4_layout_ops *nfsd4_layout_ops[LAYOUT_TYPE_MAX] =  {
> +	[LAYOUT_BLOCK_VOLUME]	= &bl_layout_ops,
>  };
>  
>  /* pNFS device ID to export fsid mapping */
> @@ -116,6 +117,12 @@ nfsd4_set_deviceid(struct nfsd4_deviceid *id, const struct svc_fh *fhp,
>  
>  void nfsd4_setup_layout_type(struct svc_export *exp)
>  {
> +	struct super_block *sb = exp->ex_path.mnt->mnt_sb;
> +
> +	if (sb->s_export_op->get_uuid &&
> +	    sb->s_export_op->map_blocks &&
> +	    sb->s_export_op->commit_blocks)
> +		exp->ex_layout_type = LAYOUT_BLOCK_VOLUME;
>  }
>  
>  static void
> diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h
> index fa37117..d6d94e1 100644
> --- a/fs/nfsd/pnfs.h
> +++ b/fs/nfsd/pnfs.h
> @@ -34,6 +34,7 @@ struct nfsd4_layout_ops {
>  };
>  
>  extern const struct nfsd4_layout_ops *nfsd4_layout_ops[];
> +extern const struct nfsd4_layout_ops bl_layout_ops;
>  
>  __be32 nfsd4_preprocess_layout_stateid(struct svc_rqst *rqstp,
>  		struct nfsd4_compound_state *cstate, stateid_t *stateid,
> -- 
> 1.9.1
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

_______________________________________________
xfs mailing list
xfs@xxxxxxxxxxx
http://oss.sgi.com/mailman/listinfo/xfs