Re: [PATCH 5/6] pnfs/blocklayout: in-kernel GETDEVICEINFO XDR parsing

Anna Schumaker <schumaker.anna@xxxxxxxxx> · Wed, 3 Sep 2014 15:31:07 -0400

Hey Christoph,

On 09/03/2014 12:38 AM, Christoph Hellwig wrote:
> This patch moves parsing of the GETDEVICEINFO XDR to kernel space, as well
> as the management of complex devices.  The reason for that is we might have
> multiple outstanding complex devices after a NOTIFY_DEVICEID4_CHANGE, which
> device mapper or md can't handle as they claim devices exclusively.
>
> But as it turns out simple striping / concatenation is fairly trivial to
> implement anyway, so we make our life simpler by reducing the reliance
> on blkmapd.  For now we still use blkmapd by feeding it synthetic SIMPLE
> device XDR to translate device signatures to device numbers, but in the
> long run I have plans to eliminate it entirely.
>
> Signed-off-by: Christoph Hellwig <hch@xxxxxx>
> ---
>  fs/nfs/blocklayout/Makefile      |   2 +-
>  fs/nfs/blocklayout/blocklayout.c |  92 ++++++----
>  fs/nfs/blocklayout/blocklayout.h |  81 ++++++++-
>  fs/nfs/blocklayout/dev.c         | 358 +++++++++++++++++++++++++++++++++++++++
>  fs/nfs/blocklayout/rpc_pipefs.c  | 141 ++++-----------
>  5 files changed, 526 insertions(+), 148 deletions(-)
>  create mode 100644 fs/nfs/blocklayout/dev.c
>
> diff --git a/fs/nfs/blocklayout/Makefile b/fs/nfs/blocklayout/Makefile
> index e177026..3ca14c3 100644
> --- a/fs/nfs/blocklayout/Makefile
> +++ b/fs/nfs/blocklayout/Makefile
> @@ -3,4 +3,4 @@
>  #
>  obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o
>  
> -blocklayoutdriver-y += blocklayout.o extent_tree.o rpc_pipefs.o
> +blocklayoutdriver-y += blocklayout.o dev.o extent_tree.o rpc_pipefs.o
> diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
> index 7b3c8c9..e92591c 100644
> --- a/fs/nfs/blocklayout/blocklayout.c
> +++ b/fs/nfs/blocklayout/blocklayout.c
> @@ -114,13 +114,10 @@ bl_submit_bio(int rw, struct bio *bio)
>  	return NULL;
>  }
>  
> -static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
> -				     struct pnfs_block_extent *be,
> -				     void (*end_io)(struct bio *, int err),
> -				     struct parallel_io *par)
> +static struct bio *
> +bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector,
> +		void (*end_io)(struct bio *, int err), struct parallel_io *par)
>  {
> -	struct pnfs_block_dev *dev =
> -		container_of(be->be_device, struct pnfs_block_dev, d_node);
>  	struct bio *bio;
>  
>  	npg = min(npg, BIO_MAX_PAGES);
> @@ -131,32 +128,55 @@ static struct bio *bl_alloc_init_bio(int npg, sector_t isect,
>  	}
>  
>  	if (bio) {
> -		bio->bi_iter.bi_sector = isect - be->be_f_offset +
> -			be->be_v_offset;
> -		bio->bi_bdev = dev->d_bdev;
> +		bio->bi_iter.bi_sector = disk_sector;
> +		bio->bi_bdev = bdev;
>  		bio->bi_end_io = end_io;
>  		bio->bi_private = par;
>  	}
>  	return bio;
>  }
>  
> -static struct bio *do_add_page_to_bio(struct bio *bio, int npg, int rw,
> -				      sector_t isect, struct page *page,
> -				      struct pnfs_block_extent *be,
> -				      void (*end_io)(struct bio *, int err),
> -				      struct parallel_io *par,
> -				      unsigned int offset, int len)
> +static struct bio *
> +do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect,
> +		struct page *page, struct pnfs_block_dev_map *map,
> +		struct pnfs_block_extent *be,
> +		void (*end_io)(struct bio *, int err),
> +		struct parallel_io *par, unsigned int offset, int *len)
>  {
> -	isect = isect + (offset >> SECTOR_SHIFT);
> +	struct pnfs_block_dev *dev =
> +		container_of(be->be_device, struct pnfs_block_dev, node);
> +	u64 disk_addr, end;
> +
>  	dprintk("%s: npg %d rw %d isect %llu offset %u len %d\n", __func__,
> -		npg, rw, (unsigned long long)isect, offset, len);
> +		npg, rw, (unsigned long long)isect, offset, *len);
> +
> +	/* translate to device offset */
> +	isect += be->be_v_offset;
> +	isect -= be->be_f_offset;
> +
> +	/* translate to physical disk offset */
> +	disk_addr = (u64)isect << SECTOR_SHIFT;
> +	if (disk_addr < map->start || disk_addr >= map->start + map->len) {
> +		if (!dev->map(dev, disk_addr, map))
> +			return ERR_PTR(-EIO);
> +		bio = bl_submit_bio(rw, bio);
> +	}
> +	disk_addr += map->disk_offset;
> +	disk_addr -= map->start;
> +
> +	/* limit length to what the device mapping allows */
> +	end = disk_addr + *len;
> +	if (end >= map->start + map->len)
> +		*len = map->start + map->len - disk_addr;
> +
>  retry:
>  	if (!bio) {
> -		bio = bl_alloc_init_bio(npg, isect, be, end_io, par);
> +		bio = bl_alloc_init_bio(npg, map->bdev,
> +				disk_addr >> SECTOR_SHIFT, end_io, par);
>  		if (!bio)
>  			return ERR_PTR(-ENOMEM);
>  	}
> -	if (bio_add_page(bio, page, len, offset) < len) {
> +	if (bio_add_page(bio, page, *len, offset) < *len) {
>  		bio = bl_submit_bio(rw, bio);
>  		goto retry;
>  	}
> @@ -203,6 +223,7 @@ static enum pnfs_try_status
>  bl_read_pagelist(struct nfs_pgio_header *header)
>  {
>  	struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg);
> +	struct pnfs_block_dev_map map = { .start = NFS4_MAX_UINT64 };
>  	struct bio *bio = NULL;
>  	struct pnfs_block_extent be;
>  	sector_t isect, extent_length = 0;
> @@ -248,28 +269,29 @@ bl_read_pagelist(struct nfs_pgio_header *header)
>  				pg_len = PAGE_CACHE_SIZE - pg_offset;
>  			else
>  				pg_len = bytes_left;
> -
> -			f_offset += pg_len;
> -			bytes_left -= pg_len;
> -			isect += (pg_offset >> SECTOR_SHIFT);
> -			extent_length -= (pg_offset >> SECTOR_SHIFT);
>  		} else {
>  			BUG_ON(pg_offset != 0);
>  			pg_len = PAGE_CACHE_SIZE;
>  		}
>  
> +		isect += (pg_offset >> SECTOR_SHIFT);
> +		extent_length -= (pg_offset >> SECTOR_SHIFT);
> +
>  		if (is_hole(&be)) {
>  			bio = bl_submit_bio(READ, bio);
>  			/* Fill hole w/ zeroes w/o accessing device */
>  			dprintk("%s Zeroing page for hole\n", __func__);
>  			zero_user_segment(pages[i], pg_offset, pg_len);
> +
> +			/* invalidate map */
> +			map.start = NFS4_MAX_UINT64;
>  		} else {
>  			bio = do_add_page_to_bio(bio,
>  						 header->page_array.npages - i,
>  						 READ,
> -						 isect, pages[i], &be,
> +						 isect, pages[i], &map, &be,
>  						 bl_end_io_read, par,
> -						 pg_offset, pg_len);
> +						 pg_offset, &pg_len);
>  			if (IS_ERR(bio)) {
>  				header->pnfs_error = PTR_ERR(bio);
>  				bio = NULL;
> @@ -278,6 +300,8 @@ bl_read_pagelist(struct nfs_pgio_header *header)
>  		}
>  		isect += (pg_len >> SECTOR_SHIFT);
>  		extent_length -= (pg_len >> SECTOR_SHIFT);
> +		f_offset += pg_len;
> +		bytes_left -= pg_len;
>  	}
>  	if ((isect << SECTOR_SHIFT) >= header->inode->i_size) {
>  		header->res.eof = 1;
> @@ -346,6 +370,7 @@ static enum pnfs_try_status
>  bl_write_pagelist(struct nfs_pgio_header *header, int sync)
>  {
>  	struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg);
> +	struct pnfs_block_dev_map map = { .start = NFS4_MAX_UINT64 };
>  	struct bio *bio = NULL;
>  	struct pnfs_block_extent be;
>  	sector_t isect, extent_length = 0;
> @@ -354,6 +379,7 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync)
>  	size_t count = header->args.count;
>  	struct page **pages = header->args.pages;
>  	int pg_index = pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT;
> +	unsigned int pg_len;
>  	struct blk_plug plug;
>  	int i;
>  
> @@ -387,19 +413,21 @@ bl_write_pagelist(struct nfs_pgio_header *header, int sync)
>  			extent_length = be.be_length - (isect - be.be_f_offset);
>  		}
>  
> +		pg_len = PAGE_CACHE_SIZE;
>  		bio = do_add_page_to_bio(bio, header->page_array.npages - i,
> -					 WRITE, isect, pages[i], &be,
> +					 WRITE, isect, pages[i], &map, &be,
>  					 bl_end_io_write, par,
> -					 0, PAGE_CACHE_SIZE);
> +					 0, &pg_len);
>  		if (IS_ERR(bio)) {
>  			header->pnfs_error = PTR_ERR(bio);
>  			bio = NULL;
>  			goto out;
>  		}
> -		offset += PAGE_CACHE_SIZE;
> -		count -= PAGE_CACHE_SIZE;
> -		isect += PAGE_CACHE_SECTORS;
> -		extent_length -= PAGE_CACHE_SECTORS;
> +
> +		offset += pg_len;
> +		count -= pg_len;
> +		isect += (pg_len >> SECTOR_SHIFT);
> +		extent_length -= (pg_len >> SECTOR_SHIFT);
>  	}
>  
>  	header->res.count = header->args.count;
> diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
> index c98d98a..3077391 100644
> --- a/fs/nfs/blocklayout/blocklayout.h
> +++ b/fs/nfs/blocklayout/blocklayout.h
> @@ -44,9 +44,75 @@
>  #define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT)
>  #define SECTOR_SIZE (1 << SECTOR_SHIFT)
>  
> +struct pnfs_block_dev;
> +
> +enum pnfs_block_volume_type {
> +	PNFS_BLOCK_VOLUME_SIMPLE	= 0,
> +	PNFS_BLOCK_VOLUME_SLICE		= 1,
> +	PNFS_BLOCK_VOLUME_CONCAT	= 2,
> +	PNFS_BLOCK_VOLUME_STRIPE	= 3,
> +};
> +
> +#define PNFS_BLOCK_MAX_UUIDS	4
> +
> +/*
> + * Random upper cap for the uuid length to avoid unbounded allocation.
> + * Not actually limited by the protocol.
> + */
> +#define PNFS_BLOCK_UUID_LEN	128
> +
> +struct pnfs_block_volume {
> +	enum pnfs_block_volume_type	type;
> +	union {
> +		struct {
> +			int		len;
> +			int		nr_sigs;
> +			struct {
> +				u64		offset;
> +				u32		sig_len;
> +				u8		sig[PNFS_BLOCK_UUID_LEN];
> +			} sigs[PNFS_BLOCK_MAX_UUIDS];
> +		} simple;
> +		struct {
> +			u64		start;
> +			u64		len;
> +			u32		volume;
> +		} slice;
> +		struct {
> +			u32		volumes_count;
> +			u32		volumes[MAX_RAID_DEVICES];
> +		} concat;
> +		struct {
> +			u64		chunk_size;
> +			u32		volumes_count;
> +			u32		volumes[MAX_RAID_DEVICES];
> +		} stripe;
> +	};
> +};
> +
> +struct pnfs_block_dev_map {
> +	sector_t			start;
> +	sector_t			len;
> +
> +	sector_t			disk_offset;
> +	struct block_device		*bdev;
> +};
> +
>  struct pnfs_block_dev {
> -	struct nfs4_deviceid_node	d_node;
> -	struct block_device		*d_bdev;
> +	struct nfs4_deviceid_node	node;
> +
> +	u64				start;
> +	u64				len;
> +
> +	u32				nr_children;
> +	struct pnfs_block_dev		*children;
> +	u64				chunk_size;
> +
> +	struct block_device		*bdev;
> +	u64				disk_offset;
> +
> +	bool (*map)(struct pnfs_block_dev *dev, u64 offset,
> +			struct pnfs_block_dev_map *map);
>  };
>  
>  enum exstate4 {
> @@ -110,6 +176,11 @@ struct bl_msg_hdr {
>  #define BL_DEVICE_REQUEST_PROC         0x1 /* User level process succeeds */
>  #define BL_DEVICE_REQUEST_ERR          0x2 /* User level process fails */
>  
> +/* dev.c */
> +struct nfs4_deviceid_node *bl_alloc_deviceid_node(struct nfs_server *server,
> +		struct pnfs_device *pdev, gfp_t gfp_mask);
> +void bl_free_deviceid_node(struct nfs4_deviceid_node *d);
> +
>  /* extent_tree.c */
>  int ext_tree_insert(struct pnfs_block_layout *bl,
>  		struct pnfs_block_extent *new);
> @@ -123,10 +194,8 @@ int ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg);
>  void ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status);
>  
>  /* rpc_pipefs.c */
> -struct nfs4_deviceid_node *bl_alloc_deviceid_node(struct nfs_server *server,
> -		struct pnfs_device *pdev, gfp_t gfp_mask);
> -void bl_free_deviceid_node(struct nfs4_deviceid_node *d);
> -
> +dev_t bl_resolve_deviceid(struct nfs_server *server,
> +		struct pnfs_block_volume *b, gfp_t gfp_mask);
>  int __init bl_init_pipefs(void);
>  void __exit bl_cleanup_pipefs(void);
>  
> diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
> new file mode 100644
> index 0000000..ae18f80
> --- /dev/null
> +++ b/fs/nfs/blocklayout/dev.c
> @@ -0,0 +1,358 @@
> +/*
> + * Copyright (c) 2014 Christoph Hellwig.
> + */
> +#include <linux/sunrpc/svc.h>
> +#include <linux/blkdev.h>
> +#include <linux/nfs4.h>
> +#include <linux/nfs_fs.h>
> +#include <linux/nfs_xdr.h>
> +
> +#include "blocklayout.h"
> +
> +static void
> +bl_free_device(struct pnfs_block_dev *dev)
> +{
> +	if (dev->nr_children) {
> +		int i;
> +
> +		for (i = 0; i < dev->nr_children; i++)
> +			bl_free_device(&dev->children[i]);
> +		kfree(dev->children);
> +	} else {
> +		if (dev->bdev)
> +			blkdev_put(dev->bdev, FMODE_READ);

else if (dev->bdev)? :)

> +	}
> +}
> +
> +void
> +bl_free_deviceid_node(struct nfs4_deviceid_node *d)
> +{
> +	struct pnfs_block_dev *dev =
> +		container_of(d, struct pnfs_block_dev, node);
> +
> +	bl_free_device(dev);
> +	kfree(dev);
> +}
> +
> +static int
> +nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b)
> +{
> +	__be32 *p;
> +	int i;
> +
> +	p = xdr_inline_decode(xdr, 4);
> +	if (!p)
> +		return -EIO;
> +	b->type = be32_to_cpup(p++);
> +
> +	switch (b->type) {
> +	case PNFS_BLOCK_VOLUME_SIMPLE:
> +		p = xdr_inline_decode(xdr, 4);
> +		if (!p)
> +			return -EIO;
> +		b->simple.nr_sigs = be32_to_cpup(p++);
> +		if (!b->simple.nr_sigs) {
> +			dprintk("no signature\n");
> +			return -EIO;
> +		}
> +
> +		b->simple.len = 4 + 4;
> +		for (i = 0; i < b->simple.nr_sigs; i++) {
> +			p = xdr_inline_decode(xdr, 8 + 4);
> +			if (!p)
> +				return -EIO;
> +			p = xdr_decode_hyper(p, &b->simple.sigs[i].offset);
> +			b->simple.sigs[i].sig_len = be32_to_cpup(p++);
> +
> +			p = xdr_inline_decode(xdr, b->simple.sigs[i].sig_len);
> +			if (!p)
> +				return -EIO;
> +			memcpy(&b->simple.sigs[i].sig, p,
> +				b->simple.sigs[i].sig_len);
> +
> +			b->simple.len += 8 + 4 + b->simple.sigs[i].sig_len;
> +		}
> +		break;
> +	case PNFS_BLOCK_VOLUME_SLICE:
> +		p = xdr_inline_decode(xdr, 8 + 8 + 4);
> +		if (!p)
> +			return -EIO;
> +		p = xdr_decode_hyper(p, &b->slice.start);
> +		p = xdr_decode_hyper(p, &b->slice.len);
> +		b->slice.volume = be32_to_cpup(p++);
> +		break;
> +	case PNFS_BLOCK_VOLUME_CONCAT:
> +		p = xdr_inline_decode(xdr, 4);
> +		if (!p)
> +			return -EIO;
> +		b->concat.volumes_count = be32_to_cpup(p++);
> +
> +		p = xdr_inline_decode(xdr, b->concat.volumes_count * 4);
> +		if (!p)
> +			return -EIO;
> +		for (i = 0; i < b->concat.volumes_count; i++)
> +			b->concat.volumes[i] = be32_to_cpup(p++);
> +		break;
> +	case PNFS_BLOCK_VOLUME_STRIPE:
> +		p = xdr_inline_decode(xdr, 8 + 4);
> +		if (!p)
> +			return -EIO;
> +		p = xdr_decode_hyper(p, &b->stripe.chunk_size);
> +		b->stripe.volumes_count = be32_to_cpup(p++);
> +
> +		p = xdr_inline_decode(xdr, b->stripe.volumes_count * 4);
> +		if (!p)
> +			return -EIO;
> +		for (i = 0; i < b->stripe.volumes_count; i++)
> +			b->stripe.volumes[i] = be32_to_cpup(p++);
> +		break;
> +	default:
> +		dprintk("unknown volume type!\n");
> +		return -EIO;
> +	}

Can you make each of these cases a helper function?

> +
> +	return 0;
> +}
> +
> +static bool bl_map_simple(struct pnfs_block_dev *dev, u64 offset,
> +		struct pnfs_block_dev_map *map)
> +{
> +	map->start = dev->start;
> +	map->len = dev->len;
> +	map->disk_offset = dev->disk_offset;
> +	map->bdev = dev->bdev;
> +	return true;
> +}
> +
> +static bool bl_map_concat(struct pnfs_block_dev *dev, u64 offset,
> +		struct pnfs_block_dev_map *map)
> +{
> +	int i;
> +
> +	for (i = 0; i < dev->nr_children; i++) {
> +		struct pnfs_block_dev *child = &dev->children[i];
> +
> +		if (child->start > offset ||
> +		    child->start + child->len <= offset)
> +			continue;
> +
> +		child->map(child, offset - child->start, map);
> +		return true;
> +	}
> +
> +	dprintk("%s: ran off loop!\n", __func__);
> +	return false;
> +}
> +
> +static bool bl_map_stripe(struct pnfs_block_dev *dev, u64 offset,
> +		struct pnfs_block_dev_map *map)
> +{
> +	struct pnfs_block_dev *child;
> +	u64 chunk = (offset / dev->chunk_size);
> +	int chunk_idx = chunk % dev->nr_children;
> +	u64 disk_offset;
> +
> +	if (chunk_idx > dev->nr_children) {
> +		dprintk("%s: invalid chunk idx %d (%lld/%lld)\n",
> +			__func__, chunk_idx, offset, dev->chunk_size);
> +		/* error, should not happen */
> +		return false;
> +	}
> +
> +	/* truncate offset to the beginning of the stripe */
> +	offset = chunk * dev->chunk_size;
> +
> +	/* disk offset of the stripe */
> +	disk_offset = offset / dev->nr_children;
> +
> +	child = &dev->children[chunk_idx];
> +	child->map(child, disk_offset, map);
> +
> +	map->start += offset;
> +	map->disk_offset += disk_offset;
> +	map->len = dev->chunk_size;
> +	return true;
> +}
> +
> +static int
> +bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
> +		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask);

Why put the declaration in the middle of the file?

Anna

> +
> +
> +static int
> +bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d,
> +		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
> +{
> +	struct pnfs_block_volume *v = &volumes[idx];
> +	dev_t dev;
> +
> +	dev = bl_resolve_deviceid(server, v, gfp_mask);
> +	if (!dev)
> +		return -EIO;
> +
> +	d->bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL);
> +	if (IS_ERR(d->bdev)) {
> +		printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n",
> +			MAJOR(dev), MINOR(dev), PTR_ERR(d->bdev));
> +		return PTR_ERR(d->bdev);
> +	}
> +
> +
> +	d->len = i_size_read(d->bdev->bd_inode);
> +	d->map = bl_map_simple;
> +
> +	printk(KERN_INFO "pNFS: using block device %s\n",
> +		d->bdev->bd_disk->disk_name);
> +	return 0;
> +}
> +
> +static int
> +bl_parse_slice(struct nfs_server *server, struct pnfs_block_dev *d,
> +		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
> +{
> +	struct pnfs_block_volume *v = &volumes[idx];
> +	int ret;
> +
> +	ret = bl_parse_deviceid(server, d, volumes, v->slice.volume, gfp_mask);
> +	if (ret)
> +		return ret;
> +
> +	d->disk_offset = v->slice.start;
> +	d->len = v->slice.len;
> +	return 0;
> +}
> +
> +static int
> +bl_parse_concat(struct nfs_server *server, struct pnfs_block_dev *d,
> +		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
> +{
> +	struct pnfs_block_volume *v = &volumes[idx];
> +	u64 len = 0;
> +	int ret, i;
> +
> +	d->children = kcalloc(v->concat.volumes_count,
> +			sizeof(struct pnfs_block_dev), GFP_KERNEL);
> +	if (!d->children)
> +		return -ENOMEM;
> +
> +	for (i = 0; i < v->concat.volumes_count; i++) {
> +		ret = bl_parse_deviceid(server, &d->children[i],
> +				volumes, v->concat.volumes[i], gfp_mask);
> +		if (ret)
> +			return ret;
> +
> +		d->nr_children++;
> +		d->children[i].start += len;
> +		len += d->children[i].len;
> +	}
> +
> +	d->len = len;
> +	d->map = bl_map_concat;
> +	return 0;
> +}
> +
> +static int
> +bl_parse_stripe(struct nfs_server *server, struct pnfs_block_dev *d,
> +		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
> +{
> +	struct pnfs_block_volume *v = &volumes[idx];
> +	u64 len = 0;
> +	int ret, i;
> +
> +	d->children = kcalloc(v->stripe.volumes_count,
> +			sizeof(struct pnfs_block_dev), GFP_KERNEL);
> +	if (!d->children)
> +		return -ENOMEM;
> +
> +	for (i = 0; i < v->stripe.volumes_count; i++) {
> +		ret = bl_parse_deviceid(server, &d->children[i],
> +				volumes, v->stripe.volumes[i], gfp_mask);
> +		if (ret)
> +			return ret;
> +
> +		d->nr_children++;
> +		len += d->children[i].len;
> +	}
> +
> +	d->len = len;
> +	d->chunk_size = v->stripe.chunk_size;
> +	d->map = bl_map_stripe;
> +	return 0;
> +}
> +
> +static int
> +bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d,
> +		struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask)
> +{
> +	switch (volumes[idx].type) {
> +	case PNFS_BLOCK_VOLUME_SIMPLE:
> +		return bl_parse_simple(server, d, volumes, idx, gfp_mask);
> +	case PNFS_BLOCK_VOLUME_SLICE:
> +		return bl_parse_slice(server, d, volumes, idx, gfp_mask);
> +	case PNFS_BLOCK_VOLUME_CONCAT:
> +		return bl_parse_concat(server, d, volumes, idx, gfp_mask);
> +	case PNFS_BLOCK_VOLUME_STRIPE:
> +		return bl_parse_stripe(server, d, volumes, idx, gfp_mask);
> +	default:
> +		dprintk("unsupported volume type: %d\n", volumes[idx].type);
> +		return -EIO;
> +	}
> +}
> +
> +struct nfs4_deviceid_node *
> +bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
> +		gfp_t gfp_mask)
> +{
> +	struct nfs4_deviceid_node *node = NULL;
> +	struct pnfs_block_volume *volumes;
> +	struct pnfs_block_dev *top;
> +	struct xdr_stream xdr;
> +	struct xdr_buf buf;
> +	struct page *scratch;
> +	int nr_volumes, ret, i;
> +	__be32 *p;
> +
> +	scratch = alloc_page(gfp_mask);
> +	if (!scratch)
> +		goto out;
> +
> +	xdr_init_decode_pages(&xdr, &buf, pdev->pages, pdev->pglen);
> +	xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE);
> +
> +	p = xdr_inline_decode(&xdr, sizeof(__be32));
> +	if (!p)
> +		goto out_free_scratch;
> +	nr_volumes = be32_to_cpup(p++);
> +
> +	volumes = kcalloc(nr_volumes, sizeof(struct pnfs_block_volume),
> +			  gfp_mask);
> +	if (!volumes)
> +		goto out_free_scratch;
> +
> +	for (i = 0; i < nr_volumes; i++) {
> +		ret = nfs4_block_decode_volume(&xdr, &volumes[i]);
> +		if (ret < 0)
> +			goto out_free_volumes;
> +	}
> +
> +	top = kzalloc(sizeof(*top), gfp_mask);
> +	if (!top)
> +		goto out_free_volumes;
> +
> +	ret = bl_parse_deviceid(server, top, volumes, nr_volumes - 1, gfp_mask);
> +	if (ret) {
> +		bl_free_device(top);
> +		kfree(top);
> +		goto out_free_volumes;
> +	}
> +
> +	node = &top->node;
> +	nfs4_init_deviceid_node(node, server, &pdev->dev_id);
> +
> +out_free_volumes:
> +	kfree(volumes);
> +out_free_scratch:
> +	__free_page(scratch);
> +out:
> +	return node;
> +}
> diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c
> index bfb0486..8d04bda 100644
> --- a/fs/nfs/blocklayout/rpc_pipefs.c
> +++ b/fs/nfs/blocklayout/rpc_pipefs.c
> @@ -34,94 +34,53 @@
>  
>  #define NFSDBG_FACILITY         NFSDBG_PNFS_LD
>  
> -static void bl_dm_remove(struct net *net, dev_t dev)
> +static void
> +nfs4_encode_simple(__be32 *p, struct pnfs_block_volume *b)
>  {
> -	struct bl_pipe_msg bl_pipe_msg;
> -	struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
> -	struct bl_dev_msg bl_umount_request;
> -	struct bl_msg_hdr bl_msg = {
> -		.type = BL_DEVICE_UMOUNT,
> -		.totallen = sizeof(bl_umount_request),
> -	};
> -	uint8_t *dataptr;
> -	DECLARE_WAITQUEUE(wq, current);
> -	struct nfs_net *nn = net_generic(net, nfs_net_id);
> -
> -	dprintk("Entering %s\n", __func__);
> -
> -	bl_pipe_msg.bl_wq = &nn->bl_wq;
> -	memset(msg, 0, sizeof(*msg));
> -	msg->len = sizeof(bl_msg) + bl_msg.totallen;
> -	msg->data = kzalloc(msg->len, GFP_NOFS);
> -	if (!msg->data)
> -		goto out;
> -
> -	memset(&bl_umount_request, 0, sizeof(bl_umount_request));
> -	bl_umount_request.major = MAJOR(dev);
> -	bl_umount_request.minor = MINOR(dev);
> -
> -	memcpy(msg->data, &bl_msg, sizeof(bl_msg));
> -	dataptr = (uint8_t *) msg->data;
> -	memcpy(&dataptr[sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request));
> -
> -	add_wait_queue(&nn->bl_wq, &wq);
> -	if (rpc_queue_upcall(nn->bl_device_pipe, msg) < 0) {
> -		remove_wait_queue(&nn->bl_wq, &wq);
> -		goto out;
> +	int i;
> +
> +	*p++ = cpu_to_be32(1);
> +	*p++ = cpu_to_be32(b->type);
> +	*p++ = cpu_to_be32(b->simple.nr_sigs);
> +	for (i = 0; i < b->simple.nr_sigs; i++) {
> +		p = xdr_encode_hyper(p, b->simple.sigs[i].offset);
> +		p = xdr_encode_opaque(p, b->simple.sigs[i].sig,
> +					 b->simple.sigs[i].sig_len);
>  	}
> -
> -	set_current_state(TASK_UNINTERRUPTIBLE);
> -	schedule();
> -	__set_current_state(TASK_RUNNING);
> -	remove_wait_queue(&nn->bl_wq, &wq);
> -
> -out:
> -	kfree(msg->data);
>  }
>  
> -/*
> - * Decodes pnfs_block_deviceaddr4 which is XDR encoded in dev->dev_addr_buf.
> - */
> -struct nfs4_deviceid_node *
> -bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *dev,
> +dev_t
> +bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b,
>  		gfp_t gfp_mask)
>  {
> -	struct pnfs_block_dev *rv;
> -	struct block_device *bd;
> -	struct bl_pipe_msg bl_pipe_msg;
> -	struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
> -	struct bl_msg_hdr bl_msg = {
> -		.type = BL_DEVICE_MOUNT,
> -		.totallen = dev->mincount,
> -	};
> -	uint8_t *dataptr;
> -	DECLARE_WAITQUEUE(wq, current);
> -	int offset, len, i, rc;
>  	struct net *net = server->nfs_client->cl_net;
>  	struct nfs_net *nn = net_generic(net, nfs_net_id);
>  	struct bl_dev_msg *reply = &nn->bl_mount_reply;
> +	struct bl_pipe_msg bl_pipe_msg;
> +	struct rpc_pipe_msg *msg = &bl_pipe_msg.msg;
> +	struct bl_msg_hdr *bl_msg;
> +	DECLARE_WAITQUEUE(wq, current);
> +	dev_t dev = 0;
> +	int rc;
>  
>  	dprintk("%s CREATING PIPEFS MESSAGE\n", __func__);
> -	dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data,
> -		dev->mincount);
>  
>  	bl_pipe_msg.bl_wq = &nn->bl_wq;
> +
> +	b->simple.len += 4;	/* single volume */
> +	if (b->simple.len > PAGE_SIZE)
> +		return -EIO;
> +
>  	memset(msg, 0, sizeof(*msg));
> -	msg->data = kzalloc(sizeof(bl_msg) + dev->mincount, gfp_mask);
> +	msg->len = sizeof(*bl_msg) + b->simple.len;
> +	msg->data = kzalloc(msg->len, gfp_mask);
>  	if (!msg->data)
>  		goto out;
>  
> -	memcpy(msg->data, &bl_msg, sizeof(bl_msg));
> -	dataptr = (uint8_t *) msg->data;
> -	len = dev->mincount;
> -	offset = sizeof(bl_msg);
> -	for (i = 0; len > 0; i++) {
> -		memcpy(&dataptr[offset], page_address(dev->pages[i]),
> -				len < PAGE_CACHE_SIZE ? len : PAGE_CACHE_SIZE);
> -		len -= PAGE_CACHE_SIZE;
> -		offset += PAGE_CACHE_SIZE;
> -	}
> -	msg->len = sizeof(bl_msg) + dev->mincount;
> +	bl_msg = msg->data;
> +	bl_msg->type = BL_DEVICE_MOUNT,
> +	bl_msg->totallen = b->simple.len;
> +	nfs4_encode_simple(msg->data + sizeof(*bl_msg), b);
>  
>  	dprintk("%s CALLING USERSPACE DAEMON\n", __func__);
>  	add_wait_queue(&nn->bl_wq, &wq);
> @@ -142,46 +101,10 @@ bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *dev,
>  		goto out;
>  	}
>  
> -	bd = blkdev_get_by_dev(MKDEV(reply->major, reply->minor),
> -			       FMODE_READ, NULL);
> -	if (IS_ERR(bd)) {
> -		printk(KERN_WARNING "%s failed to open device %d:%d (%ld)\n",
> -			__func__, reply->major, reply->minor,
> -			PTR_ERR(bd));
> -		goto out;
> -	}
> -
> -	rv = kzalloc(sizeof(*rv), gfp_mask);
> -	if (!rv)
> -		goto out;
> -
> -	nfs4_init_deviceid_node(&rv->d_node, server, &dev->dev_id);
> -	rv->d_bdev = bd;
> -
> -	dprintk("%s Created device %s with bd_block_size %u\n",
> -		__func__,
> -		bd->bd_disk->disk_name,
> -		bd->bd_block_size);
> -
> -	kfree(msg->data);
> -	return &rv->d_node;
> -
> +	dev = MKDEV(reply->major, reply->minor);
>  out:
>  	kfree(msg->data);
> -	return NULL;
> -}
> -
> -void
> -bl_free_deviceid_node(struct nfs4_deviceid_node *d)
> -{
> -	struct pnfs_block_dev *dev =
> -		container_of(d, struct pnfs_block_dev, d_node);
> -	struct net *net = d->nfs_client->cl_net;
> -
> -	blkdev_put(dev->d_bdev, FMODE_READ);
> -	bl_dm_remove(net, dev->d_bdev->bd_dev);
> -
> -	kfree(dev);
> +	return dev;
>  }
>  
>  static ssize_t bl_pipe_downcall(struct file *filp, const char __user *src,

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html