[PATCH 1/2] Add support for Ceph rbd backing store

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Ceph is distributed storage clustering software (http://ceph.com).
This allows tgtd to use the Ceph "RADOS block device" (rbd) as
backing store for a LUN.

Ceph storage is divided into pools, and each rbd image can have
multiple readonly snapshots, so the parameters to fully specify
an image include poolname, imagename, and snapshot name.

--bstype is 'rbd'
--backing-store path is [pool/]imagename[@snapname]
Up to 20 simultaneous images supported
All I/O is synchronous, 16 threads in worker pool by default
Based on the 'rdwr' bs driver

Signed-off-by: Dan Mick <dan.mick@xxxxxxxxxxx>
---
 usr/Makefile    |    2 +
 usr/bs.c        |    3 +
 usr/bs_rbd.c    |  536 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
 usr/bs_rdwr.c   |    2 -
 usr/bs_thread.h |    2 +-
 5 files changed, 542 insertions(+), 3 deletions(-)
 create mode 100644 usr/bs_rbd.c

diff --git a/usr/Makefile b/usr/Makefile
index 64cb58c..e257791 100644
--- a/usr/Makefile
+++ b/usr/Makefile
@@ -12,6 +12,8 @@ TGTD_OBJS += $(addprefix iscsi/, conn.o param.o session.o \
 		iscsid.o target.o chap.o sha1.o md5.o transport.o iscsi_tcp.o \
 		isns.o)
 TGTD_OBJS += bs_rdwr.o
+TGTD_OBJS += bs_rbd.o
+LIBS += -lrados -lrbd
 
 ifneq ($(shell test -e /usr/include/sys/eventfd.h && test -e /usr/include/libaio.h && echo 1),)
 CFLAGS += -DUSE_EVENTFD
diff --git a/usr/bs.c b/usr/bs.c
index e2faf30..65c332e 100644
--- a/usr/bs.c
+++ b/usr/bs.c
@@ -44,6 +44,9 @@ LIST_HEAD(bst_list);
 static LIST_HEAD(finished_list);
 static pthread_mutex_t finished_lock;
 
+/* used by both bs_rdwr.c and bs_rbd.c */
+int nr_iothreads = 16;
+
 static int sig_fd = -1;
 
 static int command_fd[2];
diff --git a/usr/bs_rbd.c b/usr/bs_rbd.c
new file mode 100644
index 0000000..727130e
--- /dev/null
+++ b/usr/bs_rbd.c
@@ -0,0 +1,536 @@
+/*
+ * Synchronous rbd image backing store routine
+ *
+ * modified from bs_rdwr.c:
+ * Copyright (C) 2006-2007 FUJITA Tomonori <tomof@xxxxxxx>
+ * Copyright (C) 2006-2007 Mike Christie <michaelc@xxxxxxxxxxx>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation, version 2 of the
+ * License.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301 USA
+ */
+#define _XOPEN_SOURCE 600
+
+#include <errno.h>
+#include <fcntl.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include <linux/fs.h>
+#include <sys/epoll.h>
+
+#include "list.h"
+#include "util.h"
+#include "tgtd.h"
+#include "scsi.h"
+#include "spc.h"
+#include "bs_thread.h"
+
+#include "rados/librados.h"
+#include "rbd/librbd.h"
+
+/* one cluster connection only: set up in bs_rbd_init, used by every open */
+rados_t cluster;
+
+struct active_rbd {	/* one slot of active_rbds[]; filled in by bs_rbd_open */
+	char *poolname;		/* pool part of the path, "rbd" if none given */
+	char *imagename;	/* image part of the path */
+	char *snapname;		/* snapshot part of the path, "" if none given */
+	rados_ioctx_t ioctx;	/* from rados_ioctx_create() on poolname */
+	rbd_image_t rbd_image;	/* from rbd_open(); NULL marks the slot free */
+};
+
+#define MAX_IMAGES	20	/* bs_rbd_open returns -EMFILE when all are used */
+struct active_rbd active_rbds[MAX_IMAGES];
+
+#define RBDP(fd)	(&active_rbds[fd])	/* fd is the slot index from bs_rbd_open */
+
+static void parse_imagepath(char *path, char **pool, char **image, char **snap)
+{
+	char *copy = strdup(path);	/* private copy, cut up in place */
+	char *name = copy;		/* start of the image[@snap] part */
+	char *slash = strchr(copy, '/');
+	char *at;
+
+	/* optional "pool/" prefix; without it the pool is the literal "rbd" */
+	if (slash != NULL) {
+		*slash = '\0';
+		*pool = strdup(copy);
+		name = slash + 1;
+	} else {
+		*pool = "rbd";
+	}
+	/* optional "@snap" suffix; without it the snap is the literal "" */
+	at = strchr(name, '@');
+	if (at != NULL) {
+		*at = '\0';
+		*snap = strdup(at + 1);
+	} else {
+		*snap = "";
+	}
+	*image = strdup(name);		/* what remains is the image name */
+	free(copy);
+}
+
+static void set_medium_error(int *result, uint8_t *key, uint16_t *asc)
+{	/* shared failure report for read, write and flush errors on the image */
+	*asc = ASC_READ_ERROR;
+	*key = MEDIUM_ERROR;
+	*result = SAM_STAT_CHECK_CONDITION;
+}
+
+static void bs_sync_sync_range(struct scsi_cmd *cmd, uint32_t length,
+			       int *result, uint8_t *key, uint16_t *asc)
+{
+	/* 'length' is not used: rbd_flush() is issued for the whole image */
+	struct active_rbd *rbd = RBDP(cmd->dev->fd);
+
+	if (rbd_flush(rbd->rbd_image) != 0)
+		set_medium_error(result, key, asc);
+}
+
+static void bs_rbd_request(struct scsi_cmd *cmd)
+{	/* run one SCSI command against the LU's rbd image and set its status */
+	int ret;		/* last librbd return value, also logged below */
+	uint32_t length;
+	int result = SAM_STAT_GOOD;	/* status handed to scsi_set_result() */
+	uint8_t key;		/* key/asc feed sense_data_build() on failure */
+	uint16_t asc;
+#if 0
+	/*
+	 * This should go in the sense data on error for COMPARE_AND_WRITE, but
+	 * there doesn't seem to be any attempt to do so...
+	 */
+
+	uint32_t info = 0;
+#endif
+	char *tmpbuf;
+	size_t blocksize;
+	uint64_t offset = cmd->offset;
+	uint32_t tl     = cmd->tl;
+	int do_verify = 0;	/* set by WRITE_VERIFY*: compare after writing */
+	int i;
+	char *ptr;
+	const char *write_buf = NULL;
+	ret = length = 0;
+	key = asc = 0;
+	struct active_rbd *rbd = RBDP(cmd->dev->fd);	/* dev->fd is a slot index */
+
+	switch (cmd->scb[0]) {	/* dispatch on the opcode byte */
+	case ORWRITE_16:	/* read current data, OR it into the out buffer, write */
+		length = scsi_get_out_length(cmd);
+
+		tmpbuf = malloc(length);
+		if (!tmpbuf) {
+			result = SAM_STAT_CHECK_CONDITION;
+			key = HARDWARE_ERROR;
+			asc = ASC_INTERNAL_TGT_FAILURE;
+			break;
+		}
+
+		ret = rbd_read(rbd->rbd_image, offset, length, tmpbuf);
+
+		if (ret != length) {
+			set_medium_error(&result, &key, &asc);
+			free(tmpbuf);
+			break;
+		}
+
+		ptr = scsi_get_out_buffer(cmd);
+		for (i = 0; i < length; i++)
+			ptr[i] |= tmpbuf[i];
+
+		free(tmpbuf);
+
+		write_buf = scsi_get_out_buffer(cmd);
+		goto write;
+	case COMPARE_AND_WRITE:
+		/* Blocks are transferred twice, first the set that
+		 * we compare to the existing data, and second the set
+		 * to write if the compare was successful.
+		 */
+		length = scsi_get_out_length(cmd) / 2;
+		if (length != cmd->tl) {
+			result = SAM_STAT_CHECK_CONDITION;
+			key = ILLEGAL_REQUEST;
+			asc = ASC_INVALID_FIELD_IN_CDB;
+			break;
+		}
+
+		tmpbuf = malloc(length);
+		if (!tmpbuf) {
+			result = SAM_STAT_CHECK_CONDITION;
+			key = HARDWARE_ERROR;
+			asc = ASC_INTERNAL_TGT_FAILURE;
+			break;
+		}
+
+		ret = rbd_read(rbd->rbd_image, offset, length, tmpbuf);
+
+		if (ret != length) {
+			set_medium_error(&result, &key, &asc);
+			free(tmpbuf);
+			break;
+		}
+
+		if (memcmp(scsi_get_out_buffer(cmd), tmpbuf, length)) {
+			uint32_t pos = 0;
+			char *spos = scsi_get_out_buffer(cmd);
+			char *dpos = tmpbuf;
+
+			/*
+			 * Data differed, this is assumed to be 'rare'
+			 * so use a much more expensive byte-by-byte
+			 * comparison to find out at which offset the
+			 * data differs.
+			 */
+			for (pos = 0; pos < length && *spos++ == *dpos++;
+			     pos++)
+				;
+#if 0
+			/* See comment above at declaration */
+			info = pos;
+#endif
+			result = SAM_STAT_CHECK_CONDITION;
+			key = MISCOMPARE;
+			asc = ASC_MISCOMPARE_DURING_VERIFY_OPERATION;
+			free(tmpbuf);
+			break;
+		}
+
+		/* no DPO bit (cache retention advice) support */
+		free(tmpbuf);
+
+		write_buf = scsi_get_out_buffer(cmd) + length;	/* second half */
+		goto write;
+	case SYNCHRONIZE_CACHE:
+	case SYNCHRONIZE_CACHE_16:
+		/* TODO: the requested range is ignored; length is always 0 */
+		length = (cmd->scb[0] == SYNCHRONIZE_CACHE) ? 0 : 0;
+
+		if (cmd->scb[1] & 0x2) {	/* this bit is rejected as unsupported */
+			result = SAM_STAT_CHECK_CONDITION;
+			key = ILLEGAL_REQUEST;
+			asc = ASC_INVALID_FIELD_IN_CDB;
+		} else
+			bs_sync_sync_range(cmd, length, &result, &key, &asc);
+		break;
+	case WRITE_VERIFY:
+	case WRITE_VERIFY_12:
+	case WRITE_VERIFY_16:
+		do_verify = 1;	/* fall through: write first, then jump to verify */
+	case WRITE_6:
+	case WRITE_10:
+	case WRITE_12:
+	case WRITE_16:
+		length = scsi_get_out_length(cmd);
+		write_buf = scsi_get_out_buffer(cmd);
+write:	/* also entered from ORWRITE_16 and COMPARE_AND_WRITE */
+		ret = rbd_write(rbd->rbd_image, offset, length, write_buf);
+		if (ret == length) {
+			struct mode_pg *pg;
+
+			/*
+			 * Flush when scb[1] bit 0x8 is set or mode page 0x08
+			 * bit 0x04 is clear; better not to touch pg directly.
+			 */
+			pg = find_mode_page(cmd->dev, 0x08, 0);
+			if (pg == NULL) {
+				result = SAM_STAT_CHECK_CONDITION;
+				key = ILLEGAL_REQUEST;
+				asc = ASC_INVALID_FIELD_IN_CDB;
+				break;
+			}
+			if (((cmd->scb[0] != WRITE_6) && (cmd->scb[1] & 0x8)) ||
+			    !(pg->mode_data[0] & 0x04))
+				bs_sync_sync_range(cmd, length, &result, &key,
+						   &asc);
+		} else
+			set_medium_error(&result, &key, &asc);
+
+		if (do_verify)	/* NOTE(review): taken even if the write failed */
+			goto verify;
+		break;
+	case WRITE_SAME:
+	case WRITE_SAME_16:
+		/* scb[1] bit 0x08 set: discard the range instead of writing */
+		if (cmd->scb[1] & 0x08) {
+			ret = rbd_discard(rbd->rbd_image, offset, tl);
+			if (ret != 0) {
+				eprintf("Failed to punch hole for WRITE_SAME"
+					" command\n");
+				result = SAM_STAT_CHECK_CONDITION;
+				key = HARDWARE_ERROR;
+				asc = ASC_INTERNAL_TGT_FAILURE;
+				break;
+			}
+			break;
+		}
+		while (tl > 0) {	/* one block per pass; errors do not stop it */
+			blocksize = 1 << cmd->dev->blk_shift;
+			tmpbuf = scsi_get_out_buffer(cmd);
+
+			switch (cmd->scb[1] & 0x06) {
+			case 0x02: /* PBDATA==0 LBDATA==1 */
+				put_unaligned_be32(offset, tmpbuf);
+				break;
+			case 0x04: /* PBDATA==1 LBDATA==0 */
+				/* physical sector format */
+				put_unaligned_be64(offset, tmpbuf);
+				break;
+			}
+
+			ret = rbd_write(rbd->rbd_image, offset, blocksize,
+					tmpbuf);
+			if (ret != blocksize)
+				set_medium_error(&result, &key, &asc);
+
+			offset += blocksize;
+			tl     -= blocksize;
+		}
+		break;
+	case READ_6:
+	case READ_10:
+	case READ_12:
+	case READ_16:
+		length = scsi_get_in_length(cmd);
+		ret = rbd_read(rbd->rbd_image, offset, length,
+			       scsi_get_in_buffer(cmd));
+
+		if (ret != length)
+			set_medium_error(&result, &key, &asc);
+
+		break;
+	case PRE_FETCH_10:
+	case PRE_FETCH_16:
+		break;	/* accepted, but nothing is read */
+	case VERIFY_10:
+	case VERIFY_12:
+	case VERIFY_16:
+verify:	/* also reached from WRITE_VERIFY* once the write is done */
+		length = scsi_get_out_length(cmd);
+
+		tmpbuf = malloc(length);
+		if (!tmpbuf) {
+			result = SAM_STAT_CHECK_CONDITION;
+			key = HARDWARE_ERROR;
+			asc = ASC_INTERNAL_TGT_FAILURE;
+			break;
+		}
+
+		ret = rbd_read(rbd->rbd_image, offset, length, tmpbuf);
+
+		if (ret != length)
+			set_medium_error(&result, &key, &asc);
+		else if (memcmp(scsi_get_out_buffer(cmd), tmpbuf, length)) {
+			result = SAM_STAT_CHECK_CONDITION;
+			key = MISCOMPARE;
+			asc = ASC_MISCOMPARE_DURING_VERIFY_OPERATION;
+		}
+
+		free(tmpbuf);
+		break;
+	case UNMAP:
+		if (!cmd->dev->attrs.thinprovisioning) {
+			result = SAM_STAT_CHECK_CONDITION;
+			key = ILLEGAL_REQUEST;
+			asc = ASC_INVALID_FIELD_IN_CDB;
+			break;
+		}
+
+		length = scsi_get_out_length(cmd);
+		tmpbuf = scsi_get_out_buffer(cmd);
+
+		if (length < 8)
+			break;
+
+		length -= 8;	/* skip 8 bytes, then walk 16-byte entries */
+		tmpbuf += 8;
+
+		while (length >= 16) {
+			offset = get_unaligned_be64(&tmpbuf[0]);
+			offset = offset << cmd->dev->blk_shift;
+
+			tl = get_unaligned_be32(&tmpbuf[8]);
+			tl = tl << cmd->dev->blk_shift;
+
+			if (offset + tl > cmd->dev->size) {
+				eprintf("UNMAP beyond EOF\n");
+				result = SAM_STAT_CHECK_CONDITION;
+				key = ILLEGAL_REQUEST;
+				asc = ASC_LBA_OUT_OF_RANGE;
+				break;	/* leaves the while loop only */
+			}
+
+			if (tl > 0) {
+				if (rbd_discard(rbd->rbd_image, offset, tl)
+				    != 0) {
+					eprintf("Failed to punch hole for"
+						" UNMAP at offset:%" PRIu64
+						" length:%d\n",
+						offset, tl);
+					result = SAM_STAT_CHECK_CONDITION;
+					key = HARDWARE_ERROR;
+					asc = ASC_INTERNAL_TGT_FAILURE;
+					break;	/* leaves the while loop only */
+				}
+			}
+
+			length -= 16;
+			tmpbuf += 16;
+		}
+		break;
+	default:
+		break;	/* unhandled opcodes complete with SAM_STAT_GOOD */
+	}
+
+	dprintf("io done %p %x %d %u\n", cmd, cmd->scb[0], ret, length);
+
+	scsi_set_result(cmd, result);
+
+	if (result != SAM_STAT_GOOD) {
+		eprintf("io error %p %x %d %d %" PRIu64 ", %m\n",
+			cmd, cmd->scb[0], ret, length, offset);
+		sense_data_build(cmd, key, asc);
+	}
+}
+
+
+static int bs_rbd_open(struct scsi_lu *lu, char *path, int *fd, uint64_t *size)
+{	/* claim a free active_rbds[] slot and open the image; 0 or negative error */
+	rbd_image_info_t inf;
+	struct active_rbd *rbd = NULL;
+	int lfd, ret;
+
+	/* a slot is free while its rbd_image is NULL; its index is returned in *fd */
+	for (lfd = 0; lfd < MAX_IMAGES; lfd++) {
+		if (active_rbds[lfd].rbd_image == NULL) {
+			rbd = &active_rbds[lfd];
+			*fd = lfd;
+			break;
+		}
+	}
+	if (!rbd) {
+		*fd = -1;
+		return -EMFILE;
+	}
+
+	/* parse only after a slot is secured so -EMFILE allocates nothing */
+	parse_imagepath(path, &rbd->poolname, &rbd->imagename, &rbd->snapname);
+	eprintf("bs_rbd_open: pool: %s image: %s snap: %s\n",
+		rbd->poolname, rbd->imagename, rbd->snapname);
+
+	/* keep every return code in ret: it is logged and returned on failure */
+	ret = rados_ioctx_create(cluster, rbd->poolname, &rbd->ioctx);
+	if (ret < 0) {
+		eprintf("bs_rbd_open: rados_ioctx_create: %d\n", ret);
+		return -EIO;
+	}
+	ret = rbd_open(rbd->ioctx, rbd->imagename, &rbd->rbd_image,
+		       rbd->snapname);
+	if (ret < 0) {
+		eprintf("bs_rbd_open: rbd_open: %d\n", ret);
+		goto fail_ioctx;
+	}
+	ret = rbd_stat(rbd->rbd_image, &inf, sizeof(inf));
+	if (ret < 0) {
+		eprintf("bs_rbd_open: rbd_stat: %d\n", ret);
+		goto fail_image;
+	}
+	*size = inf.size;
+	if (!lu->attrs.no_auto_lbppbe)
+		update_lbppbe(lu, (uint32_t)inf.obj_size);
+	return 0;
+fail_image:	/* undo in reverse order and leave the slot reusable */
+	rbd_close(rbd->rbd_image);
+fail_ioctx:
+	rados_ioctx_destroy(rbd->ioctx);
+	rbd->rbd_image = rbd->ioctx = NULL;
+	return ret;
+}
+
+static void bs_rbd_close(struct scsi_lu *lu)
+{	/* close the image held in slot lu->fd and release its ioctx */
+	struct active_rbd *slot = &active_rbds[lu->fd];
+
+	if (slot->rbd_image == NULL)
+		return;		/* slot is already free */
+	rbd_close(slot->rbd_image);
+	rados_ioctx_destroy(slot->ioctx);
+	slot->rbd_image = slot->ioctx = NULL;	/* marks the slot free again */
+}
+
+static tgtadm_err bs_rbd_init(struct scsi_lu *lu)
+{	/* connect the global cluster handle, then start the I/O worker threads */
+	tgtadm_err ret = TGTADM_UNKNOWN_ERR;
+	int rados_ret;
+	struct bs_thread_info *info = BS_THREAD_I(lu);
+
+	rados_ret = rados_create(&cluster, NULL);
+	if (rados_ret < 0) {
+		eprintf("bs_rbd_init: rados_create: %d\n", rados_ret);
+		return ret;
+	}
+	/* read config from environment and then default files */
+	rados_ret = rados_conf_parse_env(cluster, NULL);
+	if (rados_ret < 0) {
+		eprintf("bs_rbd_init: rados_conf_parse_env: %d\n", rados_ret);
+		goto fail;
+	}
+	rados_ret = rados_conf_read_file(cluster, NULL);
+	if (rados_ret < 0) {
+		eprintf("bs_rbd_init: rados_conf_read_file: %d\n", rados_ret);
+		goto fail;
+	}
+	rados_ret = rados_connect(cluster);
+	if (rados_ret < 0) {
+		eprintf("bs_rbd_init: rados_connect: %d\n", rados_ret);
+		goto fail;
+	}
+	ret = bs_thread_open(info, bs_rbd_request, nr_iothreads);
+	if (ret == TGTADM_SUCCESS)
+		return ret;
+fail:	/* ret is still TGTADM_UNKNOWN_ERR unless bs_thread_open set it */
+	rados_shutdown(cluster);	/* takes the handle itself, not its address */
+	return ret;
+}
+
+static void bs_rbd_exit(struct scsi_lu *lu)
+{	/* stop the worker threads first, then drop the cluster connection */
+	struct bs_thread_info *info = BS_THREAD_I(lu);
+
+	bs_thread_close(info);
+	rados_shutdown(cluster);	/* takes the handle itself, not its address */
+}
+
+static struct backingstore_template rbd_bst = {
+	.bs_name		= "rbd",	/* selected with --bstype rbd */
+	.bs_datasize		= sizeof(struct bs_thread_info),
+	.bs_init		= bs_rbd_init,	/* cluster connect + thread pool */
+	.bs_exit		= bs_rbd_exit,
+	.bs_open		= bs_rbd_open,	/* per-image slot in active_rbds[] */
+	.bs_close		= bs_rbd_close,
+	.bs_cmd_submit		= bs_thread_cmd_submit,
+	.bs_oflags_supported    = O_DIRECT | O_SYNC,
+};
+
+static __attribute__((constructor)) void bs_rbd_constructor(void)
+{	/* constructor attribute: runs at program load, before main() */
+	register_backingstore_template(&rbd_bst);	/* registers the "rbd" template */
+}
diff --git a/usr/bs_rdwr.c b/usr/bs_rdwr.c
index b59fe7b..47d2d99 100644
--- a/usr/bs_rdwr.c
+++ b/usr/bs_rdwr.c
@@ -398,8 +398,6 @@ static void bs_rdwr_close(struct scsi_lu *lu)
 	close(lu->fd);
 }
 
-int nr_iothreads = 16;
-
 static tgtadm_err bs_rdwr_init(struct scsi_lu *lu)
 {
 	struct bs_thread_info *info = BS_THREAD_I(lu);
diff --git a/usr/bs_thread.h b/usr/bs_thread.h
index beb4c3f..a7e4063 100644
--- a/usr/bs_thread.h
+++ b/usr/bs_thread.h
@@ -27,4 +27,4 @@ extern tgtadm_err bs_thread_open(struct bs_thread_info *info, request_func_t *rf
 				 int nr_threads);
 extern void bs_thread_close(struct bs_thread_info *info);
 extern int bs_thread_cmd_submit(struct scsi_cmd *cmd);
-
+extern int nr_iothreads;
-- 
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe stgt" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[Index of Archives]     [Linux SCSI]     [Linux RAID]     [Linux Clusters]     [Linux USB Devel]     [Linux Audio Users]     [Yosemite News]     [Linux Kernel]

  Powered by Linux