[PATCH] review-only: blkmapd: Add complex block layout discovery and mapping daemon

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



For review only.  Do not apply.

This daemon is required to handle upcalls from the kernel pnfs block layout
driver.

This is what's in Benny's tree now.

This patch is missing the man page, which I will add before submitting.  But
I'd like to get feedback on the general shape of the code, and whether this
one big patch should be split up.

Signed-off-by: Jim Rees <rees@xxxxxxxxx>
---
 .gitignore                           |    1 +
 configure.ac                         |    4 +
 utils/Makefile.am                    |    4 +
 utils/blkmapd/Makefile.am            |   19 ++
 utils/blkmapd/device-discovery.c     |  453 +++++++++++++++++++++++++++++
 utils/blkmapd/device-discovery.h     |  162 +++++++++++
 utils/blkmapd/device-inq.c           |  233 +++++++++++++++
 utils/blkmapd/device-process.c       |  407 ++++++++++++++++++++++++++
 utils/blkmapd/dm-device.c            |  518 ++++++++++++++++++++++++++++++++++
 utils/blkmapd/etc/initd/initd.redhat |   76 +++++
 10 files changed, 1877 insertions(+), 0 deletions(-)
 create mode 100644 utils/blkmapd/Makefile.am
 create mode 100644 utils/blkmapd/device-discovery.c
 create mode 100644 utils/blkmapd/device-discovery.h
 create mode 100644 utils/blkmapd/device-inq.c
 create mode 100644 utils/blkmapd/device-process.c
 create mode 100644 utils/blkmapd/dm-device.c
 create mode 100644 utils/blkmapd/etc/initd/initd.redhat

diff --git a/.gitignore b/.gitignore
index f5b5cf0..7bd9921 100644
--- a/.gitignore
+++ b/.gitignore
@@ -36,6 +36,7 @@ support/include/stamp-h1
 lib*.a
 tools/rpcgen/rpcgen
 tools/rpcdebug/rpcdebug
+utils/blkmapd/blkmapd
 utils/exportfs/exportfs
 utils/idmapd/idmapd
 utils/lockd/lockd
diff --git a/configure.ac b/configure.ac
index c9fb27b..08ef029 100644
--- a/configure.ac
+++ b/configure.ac
@@ -64,11 +64,14 @@ AC_ARG_ENABLE(nfsv4,
 	enable_nfsv4=yes)
 	if test "$enable_nfsv4" = yes; then
 		AC_DEFINE(NFS4_SUPPORTED, 1, [Define this if you want NFSv4 support compiled in])
+		BLKMAPD=blkmapd
 		IDMAPD=idmapd
 	else
 		enable_nfsv4=
+		BLKMAPD=
 		IDMAPD=
 	fi
+	AC_SUBST(BLKMAPD)
 	AC_SUBST(IDMAPD)
 	AC_SUBST(enable_nfsv4)
 	AM_CONDITIONAL(CONFIG_NFSV4, [test "$enable_nfsv4" = "yes"])
@@ -450,6 +453,7 @@ AC_CONFIG_FILES([
 	tools/mountstats/Makefile
 	tools/nfs-iostat/Makefile
 	utils/Makefile
+	utils/blkmapd/Makefile
 	utils/exportfs/Makefile
 	utils/gssd/Makefile
 	utils/idmapd/Makefile
diff --git a/utils/Makefile.am b/utils/Makefile.am
index a0ea116..0d222f0 100644
--- a/utils/Makefile.am
+++ b/utils/Makefile.am
@@ -9,6 +9,10 @@ OPTDIRS += nfsidmap
 endif
 endif
 
+if CONFIG_NFSV4
+OPTDIRS += blkmapd
+endif
+
 if CONFIG_GSS
 OPTDIRS += gssd
 endif
diff --git a/utils/blkmapd/Makefile.am b/utils/blkmapd/Makefile.am
new file mode 100644
index 0000000..70e299e
--- /dev/null
+++ b/utils/blkmapd/Makefile.am
@@ -0,0 +1,19 @@
+## Process this file with automake to produce Makefile.in
+
+#man8_MANS	= blkmapd.man
+
+AM_CFLAGS	+= -D_LARGEFILE64_SOURCE
+sbin_PROGRAMS	= blkmapd
+
+blkmapd_SOURCES = \
+	device-discovery.c \
+	device-inq.c \
+	device-process.c \
+	dm-device.c \
+	\
+	device-discovery.h
+
+blkmapd_LDADD = -ldevmapper ../../support/nfs/libnfs.a
+
+MAINTAINERCLEANFILES = Makefile.in
+
diff --git a/utils/blkmapd/device-discovery.c b/utils/blkmapd/device-discovery.c
new file mode 100644
index 0000000..c21de3e
--- /dev/null
+++ b/utils/blkmapd/device-discovery.c
@@ -0,0 +1,453 @@
+/*
+ * device-discovery.c: main function, device discovery, and processing of
+ * pipe requests from the kernel.
+ *
+ * Copyright (c) 2010 EMC Corporation, Haiying Tang <Tang_Haiying@xxxxxxx>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <sys/mount.h>
+#include <sys/select.h>
+#include <linux/kdev_t.h>
+#include <scsi/scsi.h>
+#include <scsi/scsi_ioctl.h>
+#include <scsi/sg.h>
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <syslog.h>
+#include <dirent.h>
+#include <ctype.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <libgen.h>
+#include <errno.h>
+#include <libdevmapper.h>
+
+#include "device-discovery.h"
+
+/* rpc_pipefs file opened by main() to read requests and write replies */
+#define BL_PIPE_FILE	"/var/lib/nfs/rpc_pipefs/nfs/blocklayout"
+/* PID file created and locked by main() when running as a daemon */
+#define PID_FILE	"/var/run/blkmapd.pid"
+
+/* Head of the singly linked list of disks built by bl_add_disk() */
+struct bl_disk *visible_disk_list;
+
+/*
+ * Search the list starting at paths for an entry whose full_path is
+ * exactly filepath.  Returns that entry, or NULL when there is no match
+ * (which includes an empty list).
+ */
+struct bl_disk_path *bl_get_path(const char *filepath,
+				 struct bl_disk_path *paths)
+{
+	struct bl_disk_path *cur;
+
+	for (cur = paths; cur != NULL; cur = cur->next) {
+		if (strcmp(cur->full_path, filepath) == 0)
+			return cur;
+	}
+	return NULL;
+}
+
+/*
+ * Return 1 when path->full_path starts with valid_path->full_path, i.e.
+ * path names the same device as valid_path or something whose name merely
+ * extends it (such as a partition); return 0 otherwise.
+ */
+int bl_is_partition(struct bl_disk_path *valid_path, struct bl_disk_path *path)
+{
+	const char *prefix = valid_path->full_path;
+	size_t prefix_len = strlen(prefix);
+
+	return strncmp(prefix, path->full_path, prefix_len) == 0 ? 1 : 0;
+}
+
+/*
+ * Decide whether a newly found path should replace disk->valid_path.
+ *
+ * Multipath devices can be PASSIVE, ACTIVE or PSEUDO, ordered
+ * PSEUDO > ACTIVE > PASSIVE, and the path with the highest state is the
+ * one used to create the pseudo device.  If device-mapper multipath
+ * support is a must, a pseudo device should exist for every multipath
+ * device; otherwise the active path is chosen.
+ *
+ * Returns 0 (keep the current valid path) only when the disk already has
+ * a valid path whose state is at least "state" and "path" looks like a
+ * partition of it; a partition is treated as an invalid path.  Returns 1
+ * in every other case.
+ */
+int bl_update_path(struct bl_disk_path *path, enum bl_path_state_e state,
+		   struct bl_disk *disk)
+{
+	struct bl_disk_path *cur_valid = disk->valid_path;
+
+	if (cur_valid == NULL)
+		return 1;
+	if (cur_valid->state < state)
+		return 1;
+	return bl_is_partition(cur_valid, path) ? 0 : 1;
+}
+
+/*
+ * Free every disk on visible_disk_list together with its path list and
+ * its serial, and leave visible_disk_list empty (NULL).
+ */
+void bl_release_disk(void)
+{
+	struct bl_disk *disk, *next_disk;
+	struct bl_disk_path *path, *next_path;
+
+	for (disk = visible_disk_list; disk != NULL; disk = next_disk) {
+		next_disk = disk->next;
+		for (path = disk->paths; path != NULL; path = next_path) {
+			next_path = path->next;
+			free(path->full_path);
+			free(path);
+		}
+		/* free(NULL) is a no-op, so no guard is needed */
+		free(disk->serial);
+		free(disk);
+	}
+	visible_disk_list = NULL;
+}
+
+/*
+ * Probe the block device at filepath and record it on visible_disk_list.
+ *
+ * The device is opened read-only, its size and serial are read, and its
+ * path state is determined.  Devices that cannot be opened, have size 0,
+ * or are not in the ACTIVE state are ignored.  If a disk with the same
+ * serial is already listed, filepath is added to that disk's paths
+ * (unless it is already there); otherwise a new disk entry is created.
+ *
+ * The serial returned by bldev_read_serial() is owned by this function
+ * until it is stored in a new disk entry; on every other path it is
+ * freed here.
+ */
+void bl_add_disk(char *filepath)
+{
+	struct bl_disk *disk = NULL;
+	int fd = 0;
+	struct stat sb;
+	off_t size = 0;
+	unsigned long nsectors = 0;
+	struct bl_serial *serial = NULL;
+	enum bl_path_state_e ap_state;
+	struct bl_disk_path *diskpath = NULL, *path = NULL;
+	dev_t dev;
+
+	fd = open(filepath, O_RDONLY | O_LARGEFILE);
+	if (fd < 0)
+		return;
+
+	if (fstat(fd, &sb)) {
+		close(fd);
+		return;
+	}
+
+	if (!sb.st_size) {
+		/* BLKGETSIZE stores an unsigned long, which need not have
+		 * the same width as off_t, so read it into its own variable.
+		 */
+		if (ioctl(fd, BLKGETSIZE, &nsectors) == 0)
+			size = nsectors;
+	} else
+		size = sb.st_size;
+
+	if (!size) {
+		close(fd);
+		return;
+	}
+
+	dev = sb.st_rdev;
+	serial = bldev_read_serial(fd, filepath);
+	if (dm_is_dm_major(major(dev)))
+		ap_state = BL_PATH_STATE_PSEUDO;
+	else
+		ap_state = bldev_read_ap_state(fd);
+	close(fd);
+
+	/* bldev_read_serial() falls back to the file name, so a NULL
+	 * result means even that allocation failed.
+	 */
+	if (!serial) {
+		BL_LOG_ERR("%s: Out of memory!\n", __func__);
+		return;
+	}
+
+	/* NOTE(review): this also skips PSEUDO (device-mapper) devices,
+	 * although the header describes PSEUDO as a valid state -- confirm
+	 * that this is intended.
+	 */
+	if (ap_state != BL_PATH_STATE_ACTIVE)
+		goto out;
+
+	for (disk = visible_disk_list; disk != NULL; disk = disk->next) {
+		/* Already scanned or a partition?
+		 * XXX: if released each time, maybe not need to compare
+		 */
+		if ((serial->len == disk->serial->len) &&
+		    !memcmp(serial->data, disk->serial->data, serial->len)) {
+			diskpath = bl_get_path(filepath, disk->paths);
+			break;
+		}
+	}
+
+	/* this exact path is already recorded for the disk */
+	if (disk && diskpath)
+		goto out;
+
+	/* add path */
+	path = malloc(sizeof(*path));
+	if (!path) {
+		BL_LOG_ERR("%s: Out of memory!\n", __func__);
+		goto out;
+	}
+	path->next = NULL;
+	path->state = ap_state;
+	path->full_path = strdup(filepath);
+	if (!path->full_path)
+		goto out;
+
+	if (!disk) {		/* add disk */
+		disk = malloc(sizeof(*disk));
+		if (!disk) {
+			BL_LOG_ERR("%s: Out of memory!\n", __func__);
+			goto out;
+		}
+		disk->next = visible_disk_list;
+		disk->dev = dev;
+		disk->size = size;
+		disk->serial = serial;
+		disk->valid_path = path;
+		disk->paths = path;
+		visible_disk_list = disk;
+		/* the new disk entry now owns the serial */
+		serial = NULL;
+	} else {
+		path->next = disk->paths;
+		disk->paths = path;
+		/* check whether we need to update disk info */
+		if (bl_update_path(path, path->state, disk)) {
+			disk->dev = dev;
+			disk->size = size;
+			disk->valid_path = path;
+		}
+	}
+	/* the path is linked into the disk and must not be freed below */
+	path = NULL;
+
+ out:
+	if (path) {
+		free(path->full_path);
+		free(path);
+	}
+	free(serial);
+	return;
+}
+
+/*
+ * Rebuild visible_disk_list from scratch: drop the previous list, then
+ * read /proc/partitions and hand every listed device that has an entry
+ * under /sys/block to bl_add_disk() as /dev/<name>.
+ *
+ * Always returns 0, even when /proc/partitions cannot be opened.
+ */
+int bl_discover_devices(void)
+{
+	char line[PATH_MAX], name[PATH_MAX], path[PATH_MAX];
+	FILE *parts;
+
+	/* release previous list */
+	bl_release_disk();
+
+	/* scan all block devices */
+	parts = fopen("/proc/partitions", "r");
+	if (!parts)
+		return 0;
+
+	while (fgets(line, sizeof line, parts) != NULL) {
+		/* the device name is the fourth field; other lines are skipped */
+		if (sscanf(line, "%*d %*d %*d %31s", name) != 1)
+			continue;
+		/* skip names without a /sys/block entry (presumably partitions) */
+		snprintf(path, sizeof path, "/sys/block/%s", name);
+		if (access(path, F_OK) < 0)
+			continue;
+		snprintf(path, sizeof path, "/dev/%s", name);
+		bl_add_disk(path);
+	}
+
+	fclose(parts);
+
+	return 0;
+}
+
+/*
+ * Process one kernel request read from the pipe fd and write the reply
+ * back to the same fd.
+ *
+ * A request is a struct bl_pipemsg_hdr followed by head.totallen bytes of
+ * payload.  BL_DEVICE_MOUNT payloads are passed to process_deviceinfo();
+ * BL_DEVICE_UMOUNT payloads are passed to dm_device_remove_all().  The
+ * reply is a struct bl_dev_msg whose status is BL_DEVICE_REQUEST_PROC on
+ * success and BL_DEVICE_REQUEST_ERR otherwise.
+ *
+ * Returns 0 when the request was read and a reply was written, -EIO on a
+ * pipe read/write error and -ENOMEM when the payload buffer cannot be
+ * allocated.
+ */
+int bl_disk_inquiry_process(int fd)
+{
+	int ret = 0;
+	struct bl_pipemsg_hdr head;
+	char *buf = NULL;
+	uint32_t major, minor;
+	uint16_t buflen;
+	struct bl_dev_msg reply;
+
+	/* read request */
+	if (atomicio(read, fd, &head, sizeof(head)) != sizeof(head)) {
+		/* Note that an error in this or the next read is pretty
+		 * catastrophic, as there is no good way to resync into
+		 * the pipe's stream.
+		 */
+		BL_LOG_ERR("Read pipefs head error!\n");
+		ret = -EIO;
+		goto out;
+	}
+
+	buflen = head.totallen;
+	buf = malloc(buflen);
+	if (!buf) {
+		BL_LOG_ERR("%s: Out of memory!\n", __func__);
+		ret = -ENOMEM;
+		goto out;
+	}
+
+	if (atomicio(read, fd, buf, buflen) != buflen) {
+		BL_LOG_ERR("Read pipefs content error!\n");
+		ret = -EIO;
+		goto out;
+	}
+
+	/* Zero the whole reply so that major/minor are never written back
+	 * uninitialized on the error and umount paths.
+	 */
+	memset(&reply, 0, sizeof(reply));
+	reply.status = BL_DEVICE_REQUEST_PROC;
+
+	switch (head.type) {
+	case BL_DEVICE_MOUNT:
+		/*
+		 * It shouldn't be necessary to discover devices here, since
+		 * process_deviceinfo() will re-discover if it can't find
+		 * the devices it needs.  But in the case of multipath
+		 * devices (ones that appear more than once, for example an
+		 * active and a standby LUN), this will re-order them in the
+		 * correct priority.
+		 */
+		bl_discover_devices();
+		if (!process_deviceinfo(buf, buflen, &major, &minor)) {
+			reply.status = BL_DEVICE_REQUEST_ERR;
+			break;
+		}
+		reply.major = major;
+		reply.minor = minor;
+		break;
+	case BL_DEVICE_UMOUNT:
+		/* The payload is handed over as a uint64_t, so reject
+		 * anything shorter instead of reading past the buffer.
+		 */
+		if (buflen < sizeof(uint64_t) ||
+		    !dm_device_remove_all((uint64_t *) buf))
+			reply.status = BL_DEVICE_REQUEST_ERR;
+		break;
+	default:
+		reply.status = BL_DEVICE_REQUEST_ERR;
+		break;
+	}
+
+	/* write to pipefs */
+	if (atomicio((void *)write, fd, &reply, sizeof(reply))
+	    != sizeof(reply)) {
+		BL_LOG_ERR("Write pipefs error!\n");
+		ret = -EIO;
+	}
+
+ out:
+	free(buf);
+	return ret;
+}
+
+/* TODO: set bl_process_stop to 1 in command */
+unsigned int bl_process_stop;
+
+/*
+ * Wait on the pipe fd with select() and call bl_disk_inquiry_process()
+ * each time it becomes readable.
+ *
+ * Returns 1 once bl_process_stop is set, 0 if select() reports no ready
+ * descriptors, and -errno if select() fails for a reason other than
+ * EINTR.
+ */
+int bl_run_disk_inquiry_process(int fd)
+{
+	fd_set rset;
+	int nready;
+
+	bl_process_stop = 0;
+
+	while (!bl_process_stop) {
+		FD_ZERO(&rset);
+		FD_SET(fd, &rset);
+
+		nready = select(fd + 1, &rset, NULL, NULL, NULL);
+		if (nready == -1) {
+			/* interrupted: re-check the stop flag and retry */
+			if (errno == EINTR)
+				continue;
+			return -errno;
+		}
+		if (nready == 0)
+			return 0;
+
+		/*
+		 * The status of an individual request is not propagated to
+		 * the caller; the loop simply waits for the next request.
+		 */
+		if (FD_ISSET(fd, &rset))
+			(void)bl_disk_inquiry_process(fd);
+	}
+	return 1;
+}
+
+/* Close and remove the PID file; a no-op when no PID file is held. */
+static void bl_remove_pidfile(int pidfd)
+{
+	if (pidfd < 0)
+		return;
+	close(pidfd);
+	unlink(PID_FILE);
+}
+
+/*
+ * Daemon entry point.
+ *
+ * Options:
+ *   -d  run device discovery once and exit
+ *   -f  stay in the foreground (no daemon(), no PID file, syslog messages
+ *       are also written to stderr)
+ *
+ * Without -f the process daemonizes, creates and locks PID_FILE, and
+ * refuses to start when PID_FILE already exists.  The PID file is removed
+ * again on every exit path taken after it has been created and locked, so
+ * a failed start does not block the next one.
+ */
+int main(int argc, char **argv)
+{
+	int fd, pidfd = -1, opt, dflag = 0, fg = 0, ret = 1;
+	struct stat statbuf;
+	char pidbuf[64];
+	int pidlen;
+
+	while ((opt = getopt(argc, argv, "df")) != -1) {
+		switch (opt) {
+		case 'd':
+			dflag = 1;
+			break;
+		case 'f':
+			fg = 1;
+			break;
+		}
+	}
+
+	if (fg) {
+		openlog("blkmapd", LOG_PERROR, 0);
+	} else {
+		if (!stat(PID_FILE, &statbuf)) {
+			fprintf(stderr, "Pid file %s already existed\n", PID_FILE);
+			exit(1);
+		}
+
+		if (daemon(0, 0) != 0) {
+			fprintf(stderr, "Daemonize failed\n");
+			exit(1);
+		}
+
+		openlog("blkmapd", LOG_PID, 0);
+		pidfd = open(PID_FILE, O_WRONLY | O_CREAT, 0644);
+		if (pidfd < 0) {
+			BL_LOG_ERR("Create pid file %s failed\n", PID_FILE);
+			exit(1);
+		}
+
+		if (lockf(pidfd, F_TLOCK, 0) < 0) {
+			/* another instance holds the file: leave it alone */
+			BL_LOG_ERR("Lock pid file %s failed\n", PID_FILE);
+			close(pidfd);
+			exit(1);
+		}
+
+		/* A PID file without contents is not fatal, but say so. */
+		pidlen = snprintf(pidbuf, sizeof(pidbuf), "%d\n",
+				  (int)getpid());
+		if (ftruncate(pidfd, 0) < 0 ||
+		    write(pidfd, pidbuf, (size_t)pidlen) != pidlen)
+			BL_LOG_WARNING("Write pid file %s failed\n", PID_FILE);
+	}
+
+	if (dflag) {
+		bl_discover_devices();
+		bl_remove_pidfile(pidfd);
+		exit(0);
+	}
+
+	/* open pipe file */
+	fd = open(BL_PIPE_FILE, O_RDWR);
+	if (fd < 0) {
+		BL_LOG_ERR("open pipe file %s error\n", BL_PIPE_FILE);
+		bl_remove_pidfile(pidfd);
+		exit(1);
+	}
+
+	while (1) {
+		/* discover device when needed */
+		bl_discover_devices();
+
+		ret = bl_run_disk_inquiry_process(fd);
+		if (ret < 0) {
+			/* what should we do with process error? */
+			BL_LOG_ERR("inquiry process return %d\n", ret);
+		}
+	}
+
+	/* Not reached while the loop above has no exit condition. */
+	bl_remove_pidfile(pidfd);
+
+	exit(ret);
+}
diff --git a/utils/blkmapd/device-discovery.h b/utils/blkmapd/device-discovery.h
new file mode 100644
index 0000000..a86eed9
--- /dev/null
+++ b/utils/blkmapd/device-discovery.h
@@ -0,0 +1,162 @@
+/*
+ * device-discovery.h: shared types, macros and prototypes for blkmapd.
+ *
+ * Copyright (c) 2010 EMC Corporation, Haiying Tang <Tang_Haiying@xxxxxxx>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#ifndef BL_DEVICE_DISCOVERY_H
+#define BL_DEVICE_DISCOVERY_H
+
+#include <stdint.h>
+
+/* Volume types stored in the bv_type field of struct bl_volume */
+enum blk_vol_type {
+	BLOCK_VOLUME_SIMPLE = 0,	/* maps to a single LU */
+	BLOCK_VOLUME_SLICE = 1,		/* slice of another volume */
+	BLOCK_VOLUME_CONCAT = 2,	/* concatenation of multiple volumes */
+	BLOCK_VOLUME_STRIPE = 3,	/* striped across multiple volumes */
+	BLOCK_VOLUME_PSEUDO = 4,
+};
+
+/*
+ * One decoded volume of a device description.
+ * All disk offset/lengths are stored in 512-byte sectors
+ */
+struct bl_volume {
+	uint32_t bv_type;		/* one of enum blk_vol_type */
+	off_t bv_size;			/* size of this volume */
+	struct bl_volume **bv_vols;	/* sub-volumes this volume refers to */
+	int bv_vol_n;			/* number of entries in bv_vols */
+	union {
+		dev_t bv_dev;		/* for BLOCK_VOLUME_SIMPLE(PSEUDO) */
+		off_t bv_stripe_unit;	/* for BLOCK_VOLUME_STRIPE(CONCAT) */
+		off_t bv_offset;	/* for BLOCK_VOLUME_SLICE */
+	} param;
+};
+
+/*
+ * One component of a disk signature: bs_length bytes that are expected
+ * at byte offset bs_offset of the disk.  A negative bs_offset is taken
+ * relative to the end of the disk.
+ */
+struct bl_sig_comp {
+	int64_t bs_offset;		/* In bytes */
+	uint32_t bs_length;		/* In bytes */
+	char *bs_string;		/* expected bytes; compared with memcmp */
+};
+
+/* Maximum number of signatures components in a simple volume */
+# define BLOCK_MAX_SIG_COMP 16
+
+/* A disk signature: the first si_num_comps entries of si_comps are valid */
+struct bl_sig {
+	int si_num_comps;
+	struct bl_sig_comp si_comps[BLOCK_MAX_SIG_COMP];
+};
+
+/*
+ * Multipath support: ACTIVE or PSEUDO device is valid,
+ *		      PASSIVE is a standby for ACTIVE.
+ * The numeric order matters: states are compared with >= when choosing
+ * which path of a disk to use.
+ */
+enum bl_path_state_e {
+	BL_PATH_STATE_PASSIVE = 1,
+	BL_PATH_STATE_ACTIVE = 2,
+	BL_PATH_STATE_PSEUDO = 3,
+};
+
+/* Device identifier used to recognize the same disk seen via several paths */
+struct bl_serial {
+	int len;			/* number of bytes in data */
+	char *data;			/* identifier bytes, not NUL-terminated */
+};
+
+/* One device node through which a disk is reachable */
+struct bl_disk_path {
+	struct bl_disk_path *next;	/* next path of the same disk */
+	char *full_path;		/* device file name, heap-allocated */
+	enum bl_path_state_e state;
+};
+
+/* One entry of visible_disk_list */
+struct bl_disk {
+	struct bl_disk *next;
+	struct bl_serial *serial;
+	dev_t dev;
+	off_t size;			/* in 512-byte sectors */
+	struct bl_disk_path *valid_path;	/* path chosen for I/O; one of paths */
+	struct bl_disk_path *paths;	/* all known paths to this disk */
+};
+
+/*
+ * Overlay used to walk the bytes of a SCSI inquiry page 0x83 response:
+ * four one-byte fields followed by len bytes of data.
+ */
+struct bl_dev_id {
+	unsigned char type;
+	unsigned char ids;
+	unsigned char reserve;
+	unsigned char len;
+	char data[0];
+};
+
+/* Reply written back to the pipe after a request has been processed */
+struct bl_dev_msg {
+	int status;			/* one of BL_DEVICE_REQUEST_* */
+	uint32_t major, minor;		/* device numbers reported for a mount */
+};
+
+/* Header that precedes every request read from the pipe */
+struct bl_pipemsg_hdr {
+	uint8_t type;			/* BL_DEVICE_MOUNT or BL_DEVICE_UMOUNT */
+	uint16_t totallen;		/* length of message excluding hdr */
+};
+
+/* Values of bl_pipemsg_hdr.type */
+#define BL_DEVICE_UMOUNT                0x0	/* Umount--delete devices */
+#define BL_DEVICE_MOUNT                 0x1	/* Mount--create devices */
+/* Values of bl_dev_msg.status */
+#define BL_DEVICE_REQUEST_INIT          0x0	/* Start request */
+#define BL_DEVICE_REQUEST_PROC          0x1	/* User process succeeds */
+#define BL_DEVICE_REQUEST_ERR           0x2	/* User process fails */
+
+/* Returns p if nbytes more bytes fit between p and end, NULL otherwise */
+uint32_t *blk_overflow(uint32_t * p, uint32_t * end, size_t nbytes);
+
+/*
+ * Bounds check before decoding nbytes from p.  Jumps to the label
+ * out_err, which the calling function must provide, when the data would
+ * run past e.
+ */
+#define BLK_READBUF(p, e, nbytes)  do { \
+	p = blk_overflow(p, e, nbytes); \
+	if (!p) {\
+		goto out_err;\
+	} \
+} while (0)
+
+/* Decode one 32-bit network-order word from the caller's local "p" */
+#define READ32(x)         (x) = ntohl(*p++)
+
+/* Decode a 64-bit value from two network-order words, high word first */
+#define READ64(x)         do {                  \
+	(x) = (uint64_t)ntohl(*p++) << 32;           \
+	(x) |= ntohl(*p++);                     \
+} while (0)
+
+/*
+ * Decode a 64-bit byte count and store it in x as 512-byte sectors.
+ * Uses the caller's local "tmp" as scratch and jumps to out_err when the
+ * value is not a multiple of 512.
+ */
+#define READ_SECTOR(x)     do { \
+	READ64(tmp); \
+	if (tmp & 0x1ff) { \
+		goto out_err; \
+	} \
+	(x) = tmp >> 9; \
+} while (0)
+
+extern struct bl_disk *visible_disk_list;
+uint64_t dm_device_create(struct bl_volume *vols, int num_vols);
+int dm_device_remove_all(uint64_t *dev);
+/* Callers treat a return value of 0 as failure */
+uint64_t process_deviceinfo(const char *dev_addr_buf,
+			    unsigned int dev_addr_len,
+			    uint32_t *major, uint32_t *minor);
+
+extern ssize_t atomicio(ssize_t(*f) (int, void *, size_t),
+			int fd, void *_s, size_t n);
+extern struct bl_serial *bldev_read_serial(int fd, const char *filename);
+extern enum bl_path_state_e bldev_read_ap_state(int fd);
+extern int bl_discover_devices(void);
+
+/* printf-style logging through syslog at the given priority */
+#define BL_LOG_INFO(fmt...)		syslog(LOG_INFO, fmt)
+#define BL_LOG_WARNING(fmt...)		syslog(LOG_WARNING, fmt)
+#define BL_LOG_ERR(fmt...)		syslog(LOG_ERR, fmt)
+#define BL_LOG_DEBUG(fmt...)		syslog(LOG_DEBUG, fmt)
+#endif
diff --git a/utils/blkmapd/device-inq.c b/utils/blkmapd/device-inq.c
new file mode 100644
index 0000000..eabc70c
--- /dev/null
+++ b/utils/blkmapd/device-inq.c
@@ -0,0 +1,233 @@
+/*
+ * device-inq.c: inquire SCSI device information.
+ *
+ * Copyright (c) 2010 EMC Corporation, Haiying Tang <Tang_Haiying@xxxxxxx>
+ * All rights reserved.
+ *
+ * This program refers to "SCSI Primary Commands - 3 (SPC-3)"
+ * at http://www.t10.org and sg_inq.c in sg3_utils-1.26 for
+ * Linux OS SCSI subsystem, by D. Gilbert.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <sys/mount.h>
+#include <sys/select.h>
+#include <scsi/scsi.h>
+#include <scsi/scsi_ioctl.h>
+#include <scsi/sg.h>
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+#include <syslog.h>
+#include <dirent.h>
+#include <ctype.h>
+#include <fcntl.h>
+#include <libgen.h>
+#include <errno.h>
+
+#include "device-discovery.h"
+
+/* Size of the buffer used for the first inquiry of a page */
+#define DEF_ALLOC_LEN	255
+/* Largest response length accepted; longer responses are rejected */
+#define MX_ALLOC_LEN	(0xc000 + 0x80)
+
+/*
+ * Allocate a bl_serial holding a copy of the len bytes at bytes.  The
+ * structure and its data live in one allocation, so a single free()
+ * releases both.  Returns NULL if the allocation fails.
+ */
+static struct bl_serial *bl_create_scsi_string(int len, const char *bytes)
+{
+	struct bl_serial *str = malloc(sizeof(*str) + len);
+
+	if (str == NULL)
+		return NULL;
+
+	str->len = len;
+	/* the data area starts directly behind the structure */
+	str->data = (char *)(str + 1);
+	memcpy(str->data, bytes, len);
+	return str;
+}
+
+/* Release a string made by bl_create_scsi_string(); NULL is accepted. */
+static void bl_free_scsi_string(struct bl_serial *str)
+{
+	/* free(NULL) does nothing, so no explicit check is required */
+	free(str);
+}
+
+/*
+ * True when a completed sg_io_hdr reports no error: the masked status
+ * and driver_status fields and the host_status field are all zero.
+ * NOTE(review): the 0x7e and 0x0f masks presumably follow sg_inq.c from
+ * sg3_utils -- confirm against the SG_IO documentation.
+ */
+#define sg_io_ok(io_hdr) \
+	((((io_hdr).status & 0x7e) == 0) && \
+	((io_hdr).host_status == 0) && \
+	(((io_hdr).driver_status & 0x0f) == 0))
+
+/* Timeout passed to every SG_IO request (presumably in milliseconds) */
+static int sg_timeout = 1 * 1000;
+
+/*
+ * Send one SCSI INQUIRY command to fd through the SG_IO ioctl and store
+ * up to len bytes of the response in buffer.  When page is non-negative
+ * that specific page is requested (command byte 1 is set to 1 and byte 2
+ * to the page code); otherwise both bytes stay 0.
+ *
+ * Returns 0 if the ioctl succeeded and the response reports no error,
+ * -1 otherwise.
+ */
+static int bldev_inquire_page(int fd, int page, char *buffer, int len)
+{
+	unsigned char cdb[] = { INQUIRY, 0, 0, 0, 0, 0 };
+	unsigned char sense[28];
+	struct sg_io_hdr hdr;
+
+	if (page >= 0) {
+		cdb[1] = 1;
+		cdb[2] = page;
+	}
+	/* allocation length, high byte first */
+	cdb[3] = (unsigned char)((len >> 8) & 0xff);
+	cdb[4] = (unsigned char)(len & 0xff);
+
+	memset(&hdr, 0, sizeof(hdr));
+	hdr.interface_id = 'S';
+	hdr.timeout = sg_timeout;
+	/* command block */
+	hdr.cmdp = cdb;
+	hdr.cmd_len = sizeof(cdb);
+	/* sense buffer */
+	hdr.sbp = sense;
+	hdr.mx_sb_len = sizeof(sense);
+	/* data transfer from the device into buffer */
+	hdr.dxfer_direction = SG_DXFER_FROM_DEV;
+	hdr.dxferp = buffer;
+	hdr.dxfer_len = len;
+
+	if (ioctl(fd, SG_IO, &hdr) < 0)
+		return -1;
+
+	return sg_io_ok(hdr) ? 0 : -1;
+}
+
+/*
+ * Read inquiry page "page" from fd into a freshly allocated buffer.
+ *
+ * The page is first requested with a DEF_ALLOC_LEN byte buffer.  If the
+ * length announced in the response (bytes 2 and 3, plus the 4 header
+ * bytes) is larger than that, the buffer is grown and the page is read
+ * again.  Responses longer than MX_ALLOC_LEN are rejected.
+ *
+ * *buffer is set by this function in every case: NULL if the first
+ * allocation failed, otherwise a buffer of at least DEF_ALLOC_LEN bytes
+ * that the caller must free() even when an error is returned.
+ *
+ * Returns 0 on success, -ENOMEM on allocation failure and -1 on any
+ * other failure.
+ */
+static int bldev_inquire_pages(int fd, int page, char **buffer)
+{
+	const unsigned char *resp;
+	char *tmp;
+	int status;
+	int len;
+
+	*buffer = calloc(DEF_ALLOC_LEN, sizeof(char));
+	if (!*buffer) {
+		BL_LOG_ERR("%s: Out of memory!\n", __func__);
+		return -ENOMEM;
+	}
+
+	status = bldev_inquire_page(fd, page, *buffer, DEF_ALLOC_LEN);
+	if (status)
+		return status;
+
+	/* Look at the response as unsigned bytes: with a signed char a
+	 * length byte of 0x80 or more would turn the length negative.
+	 */
+	resp = (const unsigned char *)*buffer;
+
+	/* the device must answer with the page that was asked for */
+	if (resp[1] != page)
+		return -1;
+
+	len = ((resp[2] << 8) | resp[3]) + 4;
+	if (len > MX_ALLOC_LEN) {
+		BL_LOG_ERR("SCSI response length too long: %d\n", len);
+		return -1;
+	}
+	if (len > DEF_ALLOC_LEN) {
+		tmp = realloc(*buffer, len);
+		if (!tmp) {
+			BL_LOG_ERR("%s: Out of memory!\n", __func__);
+			return -ENOMEM;
+		}
+		*buffer = tmp;
+		status = bldev_inquire_page(fd, page, *buffer, len);
+		if (status)
+			return status;
+	}
+	return 0;
+}
+
+/*
+ * Determine the path state of the device open on fd.
+ *
+ * For EMC multipath devices, VPD page 0xc0 carries the status: the path
+ * is PASSIVE when byte 4 of that page is below 0x02.  Every other device,
+ * including one for which the page cannot be read, is reported as ACTIVE
+ * for now.
+ */
+extern enum bl_path_state_e bldev_read_ap_state(int fd)
+{
+	enum bl_path_state_e state = BL_PATH_STATE_ACTIVE;
+	char *page = NULL;
+
+	if (bldev_inquire_pages(fd, 0xc0, &page) == 0 && page[4] < 0x02)
+		state = BL_PATH_STATE_PASSIVE;
+
+	free(page);
+	return state;
+}
+
+/*
+ * Build an identifier for the device open on fd from inquiry page 0x83.
+ *
+ * The page is walked descriptor by descriptor.  Descriptors with ID type
+ * 0, 1, 2 or 3 (low nibble of the ids byte) are candidates; when several
+ * are present the priority is 3 > 2 > 1 > 0 and the walk stops as soon
+ * as a type 3 ID has been taken.  If the page cannot be read or holds no
+ * usable ID, filename itself is used as the identifier.
+ *
+ * Returns a string the caller must free(), or NULL when out of memory.
+ *
+ * NOTE(review): only the low byte of the page length is honoured here and
+ * a descriptor's data is not checked against the end of the page --
+ * confirm whether devices with longer pages need to be supported.
+ */
+struct bl_serial *bldev_read_serial(int fd, const char *filename)
+{
+	struct bl_serial *serial_out = NULL;
+	int status = 0;
+	char *buffer = NULL;
+	struct bl_dev_id *dev_root, *dev_id;
+	unsigned int pos, len, hdr_len, id_type, current_id = 0;
+
+	status = bldev_inquire_pages(fd, 0x83, &buffer);
+	if (status)
+		goto out;
+
+	dev_root = (struct bl_dev_id *)buffer;
+	/* Every descriptor starts with a header of this many bytes; taking
+	 * it from the layout keeps the stride in step with where
+	 * dev_id->data really begins.
+	 */
+	hdr_len = (unsigned int)(dev_root->data - (char *)dev_root);
+
+	pos = 0;
+	current_id = 0;
+	len = dev_root->len;
+	/* written as an addition so that a short page cannot make the
+	 * unsigned bound wrap around
+	 */
+	while (pos + hdr_len <= len) {
+		dev_id = (struct bl_dev_id *)&(dev_root->data[pos]);
+		id_type = dev_id->ids & 0xf;
+		/* Step to the next descriptor right away so that every
+		 * path through the loop body makes progress.
+		 */
+		pos += hdr_len + dev_id->len;
+
+		/* lower priority than what we already have: skip it */
+		if (id_type < current_id)
+			continue;
+
+		switch (id_type) {
+			/* We process SCSI ID with four ID cases: 0, 1, 2 and 3.
+			 * When more than one ID is available, priority is
+			 * 3>2>1>0.
+			 */
+		case 2:	/* EUI-64 based */
+			if ((dev_id->len != 8) && (dev_id->len != 12) &&
+			    (dev_id->len != 16))
+				break;
+			/* fallthrough */
+		case 3:	/* NAA */
+			/* TODO: NAA validity judgement too complicated,
+			 * so just ignore it here.
+			 */
+			if ((dev_id->type & 0xf) != 1) {
+				BL_LOG_ERR("Binary code_set expected\n");
+				break;
+			}
+			/* fallthrough */
+		case 0:	/* vendor specific */
+		case 1:	/* T10 vendor identification */
+			current_id = id_type;
+			bl_free_scsi_string(serial_out);
+			serial_out = bl_create_scsi_string(dev_id->len,
+							   dev_id->data);
+			break;
+		default:
+			break;
+		}
+		if (current_id == 3)
+			break;
+	}
+ out:
+	if (!serial_out)
+		serial_out = bl_create_scsi_string(strlen(filename), filename);
+	free(buffer);
+	return serial_out;
+}
diff --git a/utils/blkmapd/device-process.c b/utils/blkmapd/device-process.c
new file mode 100644
index 0000000..27ff374
--- /dev/null
+++ b/utils/blkmapd/device-process.c
@@ -0,0 +1,407 @@
+/*
+ * device-process.c: detailed processing of device information sent
+ * from kernel.
+ *
+ * Copyright (c) 2006 The Regents of the University of Michigan.
+ * All rights reserved.
+ *
+ *  Andy Adamson <andros@xxxxxxxxxxxxxx>
+ *  Fred Isaman <iisaman@xxxxxxxxx>
+ *
+ * Copyright (c) 2010 EMC Corporation, Haiying Tang <Tang_Haiying@xxxxxxx>
+ *
+ * Uses code from linux/fs/nfs/blocklayout/blocklayoutdev.c.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/user.h>
+#include <arpa/inet.h>
+#include <linux/kdev_t.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <errno.h>
+
+#include "device-discovery.h"
+
+/*
+ * Format a signature for logging.
+ *
+ * Signatures of up to 8 bytes are shown as one hexadecimal number with
+ * the first byte as the least significant one.  Longer signatures are
+ * copied as raw bytes and cut off with "..." when they do not fit.
+ *
+ * Returns a pointer to a static buffer that the next call overwrites.
+ */
+static char *pretty_sig(char *sig, uint32_t siglen)
+{
+	static char rs[100];
+	uint64_t sigval;
+	unsigned int i;
+
+	if (siglen <= sizeof(sigval)) {
+		sigval = 0;
+		/* Widen each byte before shifting: as an int, a shift by 32
+		 * or more is undefined and 0x80 << 24 sign-extends.
+		 */
+		for (i = 0; i < siglen; i++)
+			sigval |= (uint64_t)((unsigned char *)sig)[i] << (i * 8);
+		snprintf(rs, sizeof rs, "0x%0llx", (unsigned long long) sigval);
+	} else {
+		if (siglen > sizeof rs - 4) {
+			siglen = sizeof rs - 4;
+			/* "..." plus its terminator fills the last 4 bytes */
+			memcpy(&rs[siglen], "...", 4);
+		} else
+			rs[siglen] = '\0';
+		memcpy(rs, sig, siglen);
+	}
+	return rs;
+}
+
+/*
+ * Check that nbytes, rounded up to whole 32-bit words, can be read
+ * starting at p without going past end.  Returns p when they fit and
+ * NULL when they do not (or when the end of the read wraps around).
+ */
+uint32_t *blk_overflow(uint32_t * p, uint32_t * end, size_t nbytes)
+{
+	size_t nwords = (nbytes + 3) >> 2;
+	uint32_t *next = p + nwords;
+
+	return (next > end || next < p) ? NULL : p;
+}
+
+/*
+ * Decode an XDR encoded signature from *pp (bounded by end) into sig.
+ *
+ * The encoding is a 32-bit component count followed, per component, by a
+ * 64-bit offset, a 32-bit length and the signature bytes padded to a
+ * multiple of 4.  The count must be at least 1 and below
+ * BLOCK_MAX_SIG_COMP.
+ *
+ * On success *pp is advanced past the signature and 0 is returned; on
+ * malformed or truncated input -EIO is returned and *pp is unchanged.
+ */
+static int decode_blk_signature(uint32_t **pp, uint32_t * end,
+				struct bl_sig *sig)
+{
+	int i;
+	uint32_t ncomps, siglen, *p = *pp;
+
+	BLK_READBUF(p, end, 4);
+	/* Validate the count as unsigned: stored straight into the int
+	 * si_num_comps, a value of 2^31 or more would turn negative, slip
+	 * past both checks and yield a signature with no components, which
+	 * then matches any disk.
+	 */
+	READ32(ncomps);
+	if (ncomps == 0) {
+		BL_LOG_ERR("0 components in sig\n");
+		goto out_err;
+	}
+	if (ncomps >= BLOCK_MAX_SIG_COMP) {
+		BL_LOG_ERR("number of sig comps %u >= BLOCK_MAX_SIG_COMP\n",
+			   ncomps);
+		goto out_err;
+	}
+	sig->si_num_comps = ncomps;
+
+	for (i = 0; i < sig->si_num_comps; i++) {
+		struct bl_sig_comp *comp = &sig->si_comps[i];
+
+		BLK_READBUF(p, end, 12);
+		READ64(comp->bs_offset);
+		READ32(siglen);
+		comp->bs_length = siglen;
+		BLK_READBUF(p, end, siglen);
+		/* Note we rely here on fact that sig is used immediately
+		 * for mapping, then thrown away.
+		 */
+		comp->bs_string = (char *)p;
+		BL_LOG_INFO("%s: si_comps[%d]: bs_length %d, bs_string %s\n",
+			    __func__, i, siglen,
+			    pretty_sig(comp->bs_string, siglen));
+		/* signature bytes are padded to a whole number of words */
+		p += ((siglen + 3) >> 2);
+	}
+	*pp = p;
+	return 0;
+ out_err:
+	return -EIO;
+}
+
+/*
+ * Read comp->bs_length bytes from fd at comp->bs_offset and compare them
+ * with comp->bs_string.  A negative offset counts from the end of the
+ * disk, whose size is taken from disk->size (512-byte sectors).
+ *
+ * return: 0=match, non-zero memcmp() result=no match, -1=error
+ */
+static int
+read_cmp_blk_sig(struct bl_disk *disk, int fd, struct bl_sig_comp *comp)
+{
+	const char *dev_name = disk->valid_path->full_path;
+	ssize_t want = comp->bs_length;
+	int64_t where = comp->bs_offset;
+	int result = -1;
+	char *got;
+
+	got = malloc(want);
+	if (got == NULL) {
+		BL_LOG_ERR("%s: Out of memory\n", __func__);
+		return -1;
+	}
+
+	if (where < 0)
+		where += (((int64_t) disk->size) << 9);
+
+	if (lseek64(fd, where, SEEK_SET) == -1) {
+		BL_LOG_ERR("File %s lseek error\n", dev_name);
+	} else if (read(fd, got, want) != want) {
+		BL_LOG_ERR("File %s read error\n", dev_name);
+	} else {
+		result = memcmp(got, comp->bs_string, want);
+		if (result == 0)
+			BL_LOG_INFO("%s: %s sig %s at %lld\n", __func__, dev_name,
+				    pretty_sig(got, want),
+				    (long long)comp->bs_offset);
+	}
+
+	free(got);
+	return result;
+}
+
+/*
+ * Check a complete signature against a disk, using the disk's valid
+ * path.  Every component of sig must be found on the disk.
+ * Returns 1 if all components match, 0 if one does not match, cannot be
+ * read, or the device cannot be opened.
+ */
+static int verify_sig(struct bl_disk *disk, struct bl_sig *sig)
+{
+	const char *dev_name = disk->valid_path->full_path;
+	int matched = 1;
+	int fd, idx;
+
+	fd = open(dev_name, O_RDONLY | O_LARGEFILE);
+	if (fd < 0) {
+		BL_LOG_ERR("%s: %s could not be opened for read\n", __func__,
+			   dev_name);
+		return 0;
+	}
+
+	/* stop at the first component that fails to match */
+	for (idx = 0; matched && idx < sig->si_num_comps; idx++) {
+		if (read_cmp_blk_sig(disk, fd, &sig->si_comps[idx]) != 0)
+			matched = 0;
+	}
+
+	close(fd);
+	return matched;
+}
+
+/*
+ * map_sig_to_device()
+ * Walk visible_disk_list looking for the first disk that carries the
+ * given signature.  On a match the disk's device number and size are
+ * stored in vol->param.bv_dev and vol->bv_size.
+ * Returns True if a disk matched, False otherwise.
+ */
+static int map_sig_to_device(struct bl_sig *sig, struct bl_volume *vol)
+{
+	struct bl_disk *candidate;
+
+	/* FIXME: should we use better algorithm for disk scan? */
+	for (candidate = visible_disk_list; candidate != NULL;
+	     candidate = candidate->next) {
+		int matched = verify_sig(candidate, sig);
+
+		if (!matched)
+			continue;
+		vol->param.bv_dev = candidate->dev;
+		vol->bv_size = candidate->size;
+		return matched;
+	}
+	return 0;
+}
+
+/*
+ * Decode vols[working].bv_vol_n XDR-encoded indices from *pp and store
+ * pointers to the referenced volumes in vols[working].bv_vols.  An index
+ * must refer to an earlier entry of vols (0 <= index < working).
+ * On success *pp is advanced past the consumed words and 0 is returned;
+ * on failure -EIO is returned and *pp is left untouched.
+ *
+ * NOTE(review): BLK_READBUF and READ32 are defined outside this chunk;
+ * presumably BLK_READBUF jumps to out_err when the buffer is exhausted
+ * and READ32 reads through the local `p` -- confirm in the header.
+ */
+static int set_vol_array(uint32_t **pp, uint32_t *end,
+			 struct bl_volume *vols, int working)
+{
+	struct bl_volume **slots = vols[working].bv_vols;
+	uint32_t *p = *pp;
+	int slot = 0;
+	int ref;
+
+	while (slot < vols[working].bv_vol_n) {
+		BLK_READBUF(p, end, 4);
+		READ32(ref);
+		if (ref < 0 || ref >= working) {
+			BL_LOG_ERR("set_vol_array: Id %i out of range\n",
+				   ref);
+			goto out_err;
+		}
+		slots[slot] = &vols[ref];
+		slot++;
+	}
+	*pp = p;
+	return 0;
+ out_err:
+	return -EIO;
+}
+
+/* Return the sum of bv_size over all sub-volumes referenced by vol. */
+static uint64_t sum_subvolume_sizes(struct bl_volume *vol)
+{
+	uint64_t total = 0;
+	int k = 0;
+
+	while (k < vol->bv_vol_n) {
+		total += vol->bv_vols[k]->bv_size;
+		k++;
+	}
+	return total;
+}
+
+/*
+ * Decode one volume description from the buffer at *pp into vols[voln].
+ *
+ * pp/end:    cursor into, and end of, the encoded buffer; *pp is advanced
+ *            only when the function reaches the end of the switch.
+ * vols/voln: volume table and the index of the entry being filled in.
+ * array_cnt: set to the number of sub-volume pointers this volume stored
+ *            in its bv_vols array (0 for a simple volume).
+ *
+ * Returns 0 on success, -ENXIO when no disk matches a simple volume's
+ * signature, -EIO for malformed input, or the status returned by
+ * decode_blk_signature()/set_vol_array().
+ *
+ * NOTE(review): BLK_READBUF, READ32 and READ_SECTOR are defined outside
+ * this chunk.  `tmp` and the `out_err` label are not referenced directly
+ * by any visible statement, so presumably those macros use them -- confirm
+ * before renaming or removing either.
+ */
+static int
+decode_blk_volume(uint32_t **pp, uint32_t *end, struct bl_volume *vols, int voln,
+		  int *array_cnt)
+{
+	int status = 0, j;
+	struct bl_sig sig;
+	uint32_t *p = *pp;
+	struct bl_volume *vol = &vols[voln];
+	uint64_t tmp;
+
+	BLK_READBUF(p, end, 4);
+	READ32(vol->bv_type);
+
+	switch (vol->bv_type) {
+	case BLOCK_VOLUME_SIMPLE:
+		/* A simple volume is identified by a signature found on disk. */
+		*array_cnt = 0;
+		status = decode_blk_signature(&p, end, &sig);
+		if (status)
+			return status;
+		/* map_sig_to_device() returns nonzero on success. */
+		status = map_sig_to_device(&sig, vol);
+		if (!status) {
+			BL_LOG_ERR("Could not find disk for device\n");
+			return -ENXIO;
+		}
+		BL_LOG_INFO("%s: simple %d\n", __func__, voln);
+		status = 0;
+		break;
+	case BLOCK_VOLUME_SLICE:
+		/* Offset and size, followed by exactly one sub-volume index. */
+		BLK_READBUF(p, end, 16);
+		READ_SECTOR(vol->param.bv_offset);
+		READ_SECTOR(vol->bv_size);
+		*array_cnt = vol->bv_vol_n = 1;
+		BL_LOG_INFO("%s: slice %d\n", __func__, voln);
+		status = set_vol_array(&p, end, vols, voln);
+		break;
+	case BLOCK_VOLUME_STRIPE:
+		BLK_READBUF(p, end, 8);
+		READ_SECTOR(vol->param.bv_stripe_unit);
+		off_t stripe_unit = vol->param.bv_stripe_unit;
+		/* Check limitations imposed by device-mapper */
+		/* i.e. a power of two, and at least PAGE_SIZE >> 9 */
+		if ((stripe_unit & (stripe_unit - 1)) != 0
+		    || stripe_unit < (off_t) (PAGE_SIZE >> 9))
+			return -EIO;
+		BLK_READBUF(p, end, 4);
+		READ32(vol->bv_vol_n);
+		if (!vol->bv_vol_n)
+			return -EIO;
+		*array_cnt = vol->bv_vol_n;
+		BL_LOG_INFO("%s: stripe %d nvols=%d unit=%ld\n", __func__, voln,
+			    vol->bv_vol_n, (long)stripe_unit);
+		status = set_vol_array(&p, end, vols, voln);
+		if (status)
+			return status;
+		/* All stripe members must have the same size. */
+		for (j = 1; j < vol->bv_vol_n; j++) {
+			if (vol->bv_vols[j]->bv_size !=
+			    vol->bv_vols[0]->bv_size) {
+				BL_LOG_ERR("varying subvol size\n");
+				return -EIO;
+			}
+		}
+		vol->bv_size = vol->bv_vols[0]->bv_size * vol->bv_vol_n;
+		break;
+	case BLOCK_VOLUME_CONCAT:
+		BLK_READBUF(p, end, 4);
+		READ32(vol->bv_vol_n);
+		if (!vol->bv_vol_n)
+			return -EIO;
+		*array_cnt = vol->bv_vol_n;
+		BL_LOG_INFO("%s: concat %d %d\n", __func__, voln,
+			    vol->bv_vol_n);
+		status = set_vol_array(&p, end, vols, voln);
+		if (status)
+			return status;
+		/* A concatenation is as large as all of its members together. */
+		vol->bv_size = sum_subvolume_sizes(vol);
+		break;
+	default:
+		BL_LOG_ERR("Unknown volume type %i\n", vol->bv_type);
+ out_err:
+		return -EIO;
+	}
+	*pp = p;
+	return status;
+}
+
+/*
+ * Decode an encoded volume topology and build the corresponding
+ * device-mapper device.
+ *
+ * dev_addr_buf/dev_addr_len: the encoded device address and its length
+ *                            in bytes.
+ * major/minor:               set to the created device's numbers on
+ *                            success; left untouched on failure.
+ *
+ * Returns the device number produced by dm_device_create(), or 0 when
+ * decoding, allocation or device creation fails.
+ */
+uint64_t process_deviceinfo(const char *dev_addr_buf,
+			    unsigned int dev_addr_len,
+			    uint32_t *major, uint32_t *minor)
+{
+	int num_vols, i, status, count;
+	uint32_t *p, *end;
+	struct bl_volume *vols = NULL, **arrays = NULL, **arrays_ptr = NULL;
+	uint64_t dev = 0;
+	/* Number of whole 32-bit words in the payload. */
+	size_t words = dev_addr_len / sizeof(uint32_t);
+	size_t array_slots;
+
+	p = (uint32_t *) dev_addr_buf;
+	end = (uint32_t *) ((char *)p + dev_addr_len);
+
+	/* Decode block volume */
+	BLK_READBUF(p, end, 4);
+	READ32(num_vols);
+	BL_LOG_INFO("%s: %d vols\n", __func__, num_vols);
+	if (num_vols <= 0)
+		goto out_err;
+
+	/*
+	 * The count comes from the wire.  Each volume takes at least one
+	 * 32-bit word (its type), so a count larger than the payload can
+	 * never decode; reject it before sizing any allocation from it.
+	 */
+	if ((size_t)num_vols > words) {
+		BL_LOG_ERR("%s: volume count %d exceeds payload\n", __func__,
+			   num_vols);
+		goto out_err;
+	}
+
+	/* calloc() checks the multiplication and zero-fills the entries. */
+	vols = calloc(num_vols, sizeof(*vols));
+	if (!vols) {
+		BL_LOG_ERR("%s: Out of memory\n", __func__);
+		goto out_err;
+	}
+
+	/* Each volume in vols array needs its own array.  Save time by
+	 * allocating them all in one large hunk.
+	 *
+	 * Every pointer stored in this hunk is decoded from one 32-bit
+	 * index in the payload (see set_vol_array()), so the total can
+	 * never exceed the number of words in the payload -- assuming
+	 * BLK_READBUF enforces the buffer bound (TODO confirm).  Sizing
+	 * by that word count, and never below the historical 2 * num_vols,
+	 * keeps the writes in bounds even when the encoded topology
+	 * references the same volume more than once, which nothing in
+	 * the decoder forbids.
+	 */
+	array_slots = 2 * (size_t)num_vols;
+	if (array_slots < words)
+		array_slots = words;
+	arrays_ptr = arrays = calloc(array_slots, sizeof(*arrays));
+	if (!arrays) {
+		BL_LOG_ERR("%s: Out of memory\n", __func__);
+		goto out_err;
+	}
+
+	for (i = 0; i < num_vols; i++) {
+		vols[i].bv_vols = arrays_ptr;
+		status = decode_blk_volume(&p, end, vols, i, &count);
+		if (status)
+			goto out_err;
+		/* count is the number of slots volume i just consumed. */
+		arrays_ptr += count;
+	}
+
+	/* Trailing bytes mean the encoding and the count disagree. */
+	if (p != end) {
+		BL_LOG_ERR("p is not equal to end!\n");
+		goto out_err;
+	}
+
+	dev = dm_device_create(vols, num_vols);
+	if (dev) {
+		*major = MAJOR(dev);
+		*minor = MINOR(dev);
+	}
+
+ out_err:
+	free(vols);
+	free(arrays);
+	return dev;
+}
diff --git a/utils/blkmapd/dm-device.c b/utils/blkmapd/dm-device.c
new file mode 100644
index 0000000..0f4f148
--- /dev/null
+++ b/utils/blkmapd/dm-device.c
@@ -0,0 +1,518 @@
+/*
+ * dm-device.c: create or remove device via device mapper API.
+ *
+ * Copyright (c) 2010 EMC Corporation, Haiying Tang <Tang_Haiying@xxxxxxx>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <linux/kdev_t.h>
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+#include <syslog.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <libdevmapper.h>
+
+#include "device-discovery.h"
+
+/* Size of the buffers used for device-mapper names and /dev/mapper paths */
+#define DM_DEV_NAME_LEN		256
+
+/* Fallback for when the included headers do not define DM_MAX_TYPE_NAME */
+#ifndef DM_MAX_TYPE_NAME
+#define DM_MAX_TYPE_NAME	16
+#endif
+
+#define DM_PARAMS_LEN		512	/* XXX: is this enough for target? */
+/*
+ * True for volume types that carry a device number in param.bv_dev.
+ * The argument is parenthesised so that any expression can be passed;
+ * note that it is still evaluated twice.
+ */
+#define TYPE_HAS_DEV(type)	(((type) == BLOCK_VOLUME_SIMPLE) || \
+			 ((type) == BLOCK_VOLUME_PSEUDO))
+
+/*
+ * One device-mapper target line, kept in a singly linked list.  The
+ * fields are handed to dm_task_add_target() in this order: offset, size,
+ * target_type, params.
+ */
+struct bl_dm_table {
+	uint64_t offset;	/* start of this target within the new device */
+	uint64_t size;		/* length of this target */
+	char target_type[DM_MAX_TYPE_NAME];	/* e.g. "linear", "striped" */
+	char params[DM_PARAMS_LEN];	/* target parameter string */
+	struct bl_dm_table *next;	/* next target line, or NULL */
+};
+
+/* List node associating a device number with its libdevmapper tree. */
+struct bl_dm_tree {
+	uint64_t dev;		/* device number used as the lookup key */
+	struct dm_tree *tree;	/* tree obtained from dm_tree_create() */
+	struct bl_dm_tree *next;	/* next list node, or NULL */
+};
+
+/* printf-style template for the names of devices created by this file */
+static const char dm_name[] = "pnfs_vol_%u";
+
+/* Next index to try for dm_name; incremented as names are generated */
+static unsigned int dev_count;
+
+/* Allocate one zero-filled table entry; returns NULL when out of memory. */
+static inline struct bl_dm_table *bl_dm_table_alloc(void)
+{
+	struct bl_dm_table *entry = calloc(1, sizeof(*entry));
+
+	return entry;
+}
+
+/* Free every entry of a table list; a NULL head is accepted. */
+static void bl_dm_table_free(struct bl_dm_table *bl_table_head)
+{
+	struct bl_dm_table *cur, *following;
+
+	for (cur = bl_table_head; cur != NULL; cur = following) {
+		/* Read the link before the entry is released. */
+		following = cur->next;
+		free(cur);
+	}
+}
+
+/* Append table to the end of the list whose head pointer is *bl_table_head. */
+static void add_to_bl_dm_table(struct bl_dm_table **bl_table_head,
+			struct bl_dm_table *table)
+{
+	struct bl_dm_table **link = bl_table_head;
+
+	/* Advance to the first NULL link: the head itself or a tail's next. */
+	while (*link != NULL)
+		link = &(*link)->next;
+	*link = table;
+}
+
+/* Head of the list of bl_dm_tree nodes; NULL when the list is empty */
+struct bl_dm_tree *bl_tree_head;
+
+/* Return the first list node recorded for dev, or NULL if there is none. */
+static struct bl_dm_tree *find_bl_dm_tree(uint64_t dev)
+{
+	struct bl_dm_tree *cur = bl_tree_head;
+
+	while (cur != NULL && cur->dev != dev)
+		cur = cur->next;
+	return cur;
+}
+
+/*
+ * Unlink and free the first list node recorded for dev, if any.  Only
+ * the node itself is freed; the dm_tree it points to is not touched.
+ */
+static void del_from_bl_dm_tree(uint64_t dev)
+{
+	struct bl_dm_tree **link = &bl_tree_head;
+	struct bl_dm_tree *victim;
+
+	while ((victim = *link) != NULL) {
+		if (victim->dev == dev) {
+			/* Bypass the node; this also covers the head. */
+			*link = victim->next;
+			free(victim);
+			return;
+		}
+		link = &victim->next;
+	}
+}
+
+/* Append tree to the end of the global list; tree->next is not modified. */
+static void add_to_bl_dm_tree(struct bl_dm_tree *tree)
+{
+	struct bl_dm_tree **link = &bl_tree_head;
+
+	while (*link != NULL)
+		link = &(*link)->next;
+	*link = tree;
+}
+
+/*
+ * Create a device-mapper device called dev_name whose table consists of
+ * the target lines in list p.
+ * Returns the new device's number, or 0 if any step fails.
+ */
+static uint64_t
+dm_device_create_mapped(const char *dev_name, struct bl_dm_table *p)
+{
+	struct dm_info dminfo;
+	struct bl_dm_table *row;
+	int ok;
+	struct dm_task *dmt = dm_task_create(DM_DEVICE_CREATE);
+
+	if (dmt == NULL) {
+		BL_LOG_ERR("Create dm_task for %s failed\n", dev_name);
+		return 0;
+	}
+
+	/* Name the task, then load target lines until one is rejected. */
+	ok = dm_task_set_name(dmt, dev_name);
+	for (row = p; ok && row != NULL; row = row->next)
+		ok = dm_task_add_target(dmt, row->offset, row->size,
+					row->target_type, row->params);
+
+	/* dminfo is only trusted once run, get_info and exists all hold. */
+	if (ok)
+		ok = dm_task_run(dmt) && dm_task_get_info(dmt, &dminfo)
+		    && dminfo.exists;
+	if (ok)
+		dm_task_update_nodes();
+
+	dm_task_destroy(dmt);
+
+	if (!ok) {
+		BL_LOG_ERR("Create device %s failed\n", dev_name);
+		return 0;
+	}
+	return MKDEV(dminfo.major, dminfo.minor);
+}
+
+/*
+ * Remove the device-mapper device called dev_name.
+ * Returns 1 if the remove task was named and run successfully, else 0.
+ */
+static int dm_device_remove_byname(const char *dev_name)
+{
+	struct dm_task *dmt;
+	int removed = 0;
+
+	BL_LOG_INFO("%s: %s\n", __func__, dev_name);
+
+	dmt = dm_task_create(DM_DEVICE_REMOVE);
+	if (dmt == NULL)
+		return 0;
+
+	if (dm_task_set_name(dmt, dev_name))
+		removed = (dm_task_run(dmt) != 0);
+
+	/* Nodes are refreshed whether or not the removal worked. */
+	dm_task_update_nodes();
+	dm_task_destroy(dmt);
+
+	return removed;
+}
+
+/*
+ * Remove the device-mapper device whose device number is dev.
+ *
+ * The device name is looked up by listing all device-mapper devices;
+ * the device is then removed by name.
+ *
+ * Returns the result of dm_device_remove_byname() when the device was
+ * found, and 0 when the lookup fails or no device has that number.
+ */
+int dm_device_remove(uint64_t dev)
+{
+	struct dm_task *dmt;
+	struct dm_names *dmnames;
+	char *name = NULL;
+	unsigned int next;
+	int ret = 0;
+
+	/* Look for dev_name via dev, if dev_name could be transferred here,
+	   we could jump to DM_DEVICE_REMOVE directly */
+
+	dmt = dm_task_create(DM_DEVICE_LIST);
+	if (!dmt) {
+		BL_LOG_ERR("dm_task creation failed\n");
+		return 0;
+	}
+
+	if (!dm_task_run(dmt)) {
+		BL_LOG_ERR("dm_task_run failed\n");
+		goto out;
+	}
+
+	dmnames = dm_task_get_names(dmt);
+	if (!dmnames || !dmnames->dev) {
+		BL_LOG_ERR("dm_task_get_names failed\n");
+		goto out;
+	}
+
+	/*
+	 * Entries are chained by byte offset, and an offset of 0 marks the
+	 * last entry.  The walk must stop there: adding 0 would revisit the
+	 * same entry forever when dev is not in the list.
+	 */
+	do {
+		if (dmnames->dev == dev) {
+			name = strdup(dmnames->name);
+			break;
+		}
+		next = dmnames->next;
+		dmnames = (struct dm_names *)((char *)dmnames + next);
+	} while (next);
+
+	if (!name) {
+		BL_LOG_ERR("Could not find device\n");
+		goto out;
+	}
+
+	dm_task_update_nodes();
+
+ out:
+	/* The list task owns the names buffer; name is our own copy. */
+	dm_task_destroy(dmt);
+
+	/* Start to remove device */
+	if (name) {
+		ret = dm_device_remove_byname(name);
+		free(name);
+	}
+
+	return ret;
+}
+
+/*
+ * Remove the devices named after indices start .. end - 2 (inclusive),
+ * highest index first.  Does nothing when start >= dev_count, when
+ * end <= 1, or when the range is empty.
+ */
+static void dm_devicelist_remove(unsigned int start, unsigned int end)
+{
+	char name[DM_DEV_NAME_LEN];
+	unsigned int idx;
+
+	if (start >= dev_count)
+		return;
+	if (end <= 1)
+		return;
+	if (start >= end - 1)
+		return;
+
+	idx = end - 1;
+	while (idx > start) {
+		snprintf(name, sizeof(name), dm_name, idx - 1);
+		dm_device_remove_byname(name);
+		idx--;
+	}
+}
+
+/* Free the dm_tree recorded for dev and drop its list node, if present. */
+static void bl_dm_remove_tree(uint64_t dev)
+{
+	struct bl_dm_tree *entry = find_bl_dm_tree(dev);
+
+	if (entry != NULL) {
+		dm_tree_free(entry->tree);
+		del_from_bl_dm_tree(dev);
+	}
+}
+
+/*
+ * Make sure a dm_tree containing dev is recorded in the global list.
+ * Returns 1 if one already existed or was created, 0 on failure.
+ */
+static int bl_dm_create_tree(uint64_t dev)
+{
+	struct dm_tree *tree;
+	struct bl_dm_tree *entry;
+
+	if (find_bl_dm_tree(dev) != NULL)
+		return 1;
+
+	tree = dm_tree_create();
+	if (tree == NULL)
+		return 0;
+
+	if (!dm_tree_add_dev(tree, MAJOR(dev), MINOR(dev)))
+		goto fail;
+
+	entry = malloc(sizeof(*entry));
+	if (entry == NULL)
+		goto fail;
+
+	entry->dev = dev;
+	entry->tree = tree;
+	entry->next = NULL;
+	add_to_bl_dm_tree(entry);
+
+	return 1;
+
+ fail:
+	dm_tree_free(tree);
+	return 0;
+}
+
+/*
+ * Remove a device and deactivate its children in the recorded tree.
+ *
+ * dev points at two consecutive 32-bit values, major then minor.
+ * Returns 0 when no tree, node or uuid is found for the device;
+ * otherwise the result of dm_tree_deactivate_children().
+ */
+int dm_device_remove_all(uint64_t *dev)
+{
+	struct bl_dm_tree *entry;
+	struct dm_tree_node *node;
+	const char *uuid;
+	uint32_t words[2];	/* [0] = major, [1] = minor */
+	uint64_t bl_dev;
+	int ret;
+
+	memcpy(words, dev, sizeof(words));
+	bl_dev = MKDEV(words[0], words[1]);
+
+	entry = find_bl_dm_tree(bl_dev);
+	if (entry == NULL)
+		return 0;
+
+	node = dm_tree_find_node(entry->tree, MAJOR(bl_dev), MINOR(bl_dev));
+	if (node == NULL)
+		return 0;
+
+	uuid = dm_tree_node_get_uuid(node);
+	if (uuid == NULL)
+		return 0;
+
+	/* The top-level device goes first, then everything beneath it. */
+	dm_device_remove(bl_dev);
+	ret = dm_tree_deactivate_children(node, uuid, strlen(uuid));
+	dm_task_update_nodes();
+	bl_dm_remove_tree(bl_dev);
+
+	return ret;
+}
+
+/* Return 1 if /dev/mapper/<dev_name> exists, 0 otherwise. */
+static int dm_device_exists(char *dev_name)
+{
+	char path[DM_DEV_NAME_LEN];
+
+	snprintf(path, sizeof(path), "/dev/mapper/%s", dev_name);
+	if (access(path, F_OK) < 0)
+		return 0;
+	return 1;
+}
+
+/*
+ * Build device-mapper devices for a decoded volume topology.
+ *
+ * Volumes are processed in array order.  A simple volume already has a
+ * device; every slice, stripe and concat volume gets a new device named
+ * from dm_name, after which its type is changed to BLOCK_VOLUME_PSEUDO
+ * and its device number stored so later volumes can refer to it.
+ *
+ * Returns the device number of the last volume processed, or 0 on any
+ * failure.  On failure the devices created by this call are removed
+ * again.
+ *
+ * NOTE(review): the cleanup removes by index range, so a pre-existing
+ * pnfs_vol_N that was skipped during name generation and lies inside
+ * that range would be removed too -- confirm whether that can happen.
+ */
+/* TODO: check the value for DM_DEV_NAME_LEN, DM_TYPE_LEN, DM_PARAMS_LEN */
+uint64_t dm_device_create(struct bl_volume *vols, int num_vols)
+{
+	uint64_t size, stripe_unit, dev = 0;
+	unsigned int count = dev_count;	/* first name index of this call */
+	unsigned int created_end = dev_count;	/* one past last created index */
+	int volnum, i, n;
+	size_t len;
+	struct bl_volume *node;
+	char *tmp;
+	struct bl_dm_table *table = NULL;
+	struct bl_dm_table *bl_table_head = NULL;
+	char dev_name[DM_DEV_NAME_LEN];
+
+	/* Create pseudo device here */
+	for (volnum = 0; volnum < num_vols; volnum++) {
+		node = &vols[volnum];
+		switch (node->bv_type) {
+		case BLOCK_VOLUME_SIMPLE:
+			/* Do not need to create device here */
+			dev = node->param.bv_dev;
+			continue;
+		case BLOCK_VOLUME_SLICE:
+			if (!TYPE_HAS_DEV(node->bv_vols[0]->bv_type))
+				goto fail;
+			table = bl_dm_table_alloc();
+			if (!table)
+				goto fail;
+			/* Linked at once so that `fail` frees it with the list. */
+			add_to_bl_dm_table(&bl_table_head, table);
+			table->offset = 0;
+			table->size = node->bv_size;
+			strcpy(table->target_type, "linear");
+			dev = node->bv_vols[0]->param.bv_dev;
+			tmp = table->params;
+			if (!dm_format_dev(tmp, DM_PARAMS_LEN,
+					   MAJOR(dev), MINOR(dev)))
+				goto fail;
+			len = strlen(tmp);
+			n = snprintf(tmp + len, DM_PARAMS_LEN - len, " %llu",
+				     (unsigned long long)node->param.bv_offset);
+			if (n < 0 || (size_t)n >= DM_PARAMS_LEN - len)
+				goto fail;
+			break;
+		case BLOCK_VOLUME_STRIPE:
+			stripe_unit = node->param.bv_stripe_unit;
+			/* Guards the modulo below against division by zero. */
+			if (!stripe_unit)
+				goto fail;
+			table = bl_dm_table_alloc();
+			if (!table)
+				goto fail;
+			add_to_bl_dm_table(&bl_table_head, table);
+			table->offset = 0;
+			/* Truncate size to a stripe unit boundary */
+			table->size =
+			    node->bv_size - (node->bv_size % stripe_unit);
+			strcpy(table->target_type, "striped");
+			tmp = table->params;
+			len = DM_PARAMS_LEN;
+			n = snprintf(tmp, len, "%d %llu ", node->bv_vol_n,
+				     (long long unsigned) stripe_unit);
+			if (n < 0 || (size_t)n >= len)
+				goto fail;
+			tmp += n;
+			len -= n;
+			/* Copy subdev major:minor to params */
+			for (i = 0; i < node->bv_vol_n; i++) {
+				if (!TYPE_HAS_DEV(node->bv_vols[i]->bv_type))
+					goto fail;
+				dev = node->bv_vols[i]->param.bv_dev;
+				if (!dm_format_dev(tmp, (int)len, MAJOR(dev),
+						   MINOR(dev)))
+					goto fail;
+				n = (int)strlen(tmp);
+				tmp += n;
+				len -= n;
+				/* Each member starts at offset 0. */
+				n = snprintf(tmp, len, " 0 ");
+				if (n < 0 || (size_t)n >= len)
+					goto fail;
+				tmp += n;
+				len -= n;
+			}
+			break;
+		case BLOCK_VOLUME_CONCAT:
+			size = 0;
+			for (i = 0; i < node->bv_vol_n; i++) {
+				if (!TYPE_HAS_DEV(node->bv_vols[i]->bv_type))
+					goto fail;
+				table = bl_dm_table_alloc();
+				if (!table)
+					goto fail;
+				add_to_bl_dm_table(&bl_table_head, table);
+				/* Members are laid out back to back. */
+				table->offset = size;
+				table->size = node->bv_vols[i]->bv_size;
+				strcpy(table->target_type, "linear");
+				tmp = table->params;
+				dev = node->bv_vols[i]->param.bv_dev;
+				if (!dm_format_dev(tmp, DM_PARAMS_LEN,
+						   MAJOR(dev), MINOR(dev)))
+					goto fail;
+				len = strlen(tmp);
+				n = snprintf(tmp + len, DM_PARAMS_LEN - len,
+					     " 0");
+				if (n < 0 || (size_t)n >= DM_PARAMS_LEN - len)
+					goto fail;
+				size += table->size;
+			}
+			break;
+		default:
+			goto fail;
+		}		/* end of switch */
+
+		/* Create dev_name here. Name of device is pnfs_vol_XXX */
+		do {
+			snprintf(dev_name, sizeof(dev_name), dm_name,
+				 dev_count++);
+		} while (dm_device_exists(dev_name));
+
+		dev = dm_device_create_mapped(dev_name, bl_table_head);
+		BL_LOG_INFO("%s: %d %s %d:%d\n", __func__, volnum, dev_name,
+			    (int) MAJOR(dev), (int) MINOR(dev));
+		if (!dev)
+			goto fail;
+		created_end = dev_count;
+		node->param.bv_dev = dev;
+		/* TODO: extend use with PSEUDO later */
+		node->bv_type = BLOCK_VOLUME_PSEUDO;
+
+		bl_dm_table_free(bl_table_head);
+		bl_table_head = NULL;
+	}
+
+	if (dev)
+		bl_dm_create_tree(dev);
+	return dev;
+
+ fail:
+	bl_dm_table_free(bl_table_head);
+	/*
+	 * Delete previous temporary devices.  dm_devicelist_remove() removes
+	 * indices start .. end - 2, hence the + 1.  Never return a stale
+	 * device number from an earlier volume as if creation had succeeded.
+	 */
+	dm_devicelist_remove(count, created_end + 1);
+	return 0;
+}
diff --git a/utils/blkmapd/etc/initd/initd.redhat b/utils/blkmapd/etc/initd/initd.redhat
new file mode 100644
index 0000000..d6a77e8
--- /dev/null
+++ b/utils/blkmapd/etc/initd/initd.redhat
@@ -0,0 +1,76 @@
+#!/bin/sh
+#
+# description: Starts and stops blkmapd, the pNFS block layout mapping daemon
+#
+# processname: blkmapd
+# pidfile: /var/run/blkmapd.pid
+# config:  /etc/blkmapd.conf
+
+# Load the init-script helper library from whichever location exists,
+# preferring /etc/init.d; exit quietly when neither file is present.
+[ -f /etc/init.d/functions ] || [ -f /etc/rc.d/init.d/functions ] || exit 0
+if [ -f /etc/init.d/functions ] ; then
+	. /etc/init.d/functions
+else
+	. /etc/rc.d/init.d/functions
+fi
+
+PATH=/sbin:/bin:/usr/sbin:/usr/bin
+
+# Exit status of the script; updated by start() and stop().
+RETVAL=0
+
+# Load the layout driver module and launch blkmapd.  The subsys lock file
+# is created only when the daemon started successfully.
+start()
+{
+	echo -n $"Starting pNFS block-layout device discovery service: "
+	modprobe -q blocklayoutdriver
+	daemon /usr/sbin/blkmapd
+	RETVAL=$?
+	[ $RETVAL -eq 0 ] && touch /var/lock/subsys/blkmapd
+	echo
+	return $RETVAL
+}
+
+# Stop blkmapd and clean up its pid file.  The reported status is that of
+# killproc; it must be captured before any other command overwrites $?.
+stop()
+{
+	echo -n $"Stopping pNFS block-layout device discovery service: "
+	killproc blkmapd 2> /dev/null
+	RETVAL=$?
+	rm -f /var/run/blkmapd.pid
+	if [ $RETVAL -eq 0 ]; then
+		rm -f /var/lock/subsys/blkmapd
+		echo_success
+	else
+		echo_failure
+	fi
+	echo
+	return $RETVAL
+}
+
+# Stop the service, then start it again regardless of how stop went.
+restart()
+{
+	stop; start
+}
+
+# Dispatch on the requested action.
+case "$1" in
+	start)   start ;;
+	stop)    stop ;;
+	restart) restart ;;
+	status)  status blkmapd ;;
+	*)
+		echo $"Usage: $0 {start|stop|restart|status}"
+		exit 1
+		;;
+esac
+
+exit $RETVAL
-- 
1.7.4.1

--
To unsubscribe from this list: send the line "unsubscribe linux-nfs" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Index of Archives]     [Linux Filesystem Development]     [Linux USB Development]     [Linux Media Development]     [Video for Linux]     [Linux NILFS]     [Linux Audio Users]     [Yosemite Info]     [Linux SCSI]

  Powered by Linux