I am reposting this patch with the following changes: 1) Updated patch for the Makefile changes, assuming dm-log-cluster.patch precedes this patch 2) Added Documentation file, Documentation/dm-exception-store.txt, that describes exception store implementation and various types. brassow This patch provides an exception store implementation that is capable of "wrapping" other exception store implementations and making them cluster-aware. It is not a stand-alone implementation. It merely uses distributed locking to protect exception store metadata as the single-machine "core" exception stores perform their actions independently. This is why the module uses the term "clusterized" instead of "clustered". Signed-off-by: Jonathan Brassow <jbrassow@xxxxxxxxxx> Index: linux-2.6/drivers/md/Kconfig =================================================================== --- linux-2.6.orig/drivers/md/Kconfig +++ linux-2.6/drivers/md/Kconfig @@ -244,10 +244,23 @@ config DM_CRYPT If unsure, say N. config DM_SNAPSHOT - tristate "Snapshot target" - depends on BLK_DEV_DM - ---help--- - Allow volume managers to take writable snapshots of a device. + tristate "Snapshot target" + depends on BLK_DEV_DM + ---help--- + Allow volume managers to take writable snapshots of a device. + +config DM_EXSTORE_CLUSTERIZED + tristate "Cluster-aware exception store wrapper (EXPERIMENTAL)" + depends on BLK_DEV_DM && DM_SNAPSHOT && EXPERIMENTAL + select DLM + ---help--- + An exception store is a module that is used by snapshots to + record COW areas. This module is capable of wrapping certain + exception stores so that they appear to be cluster-aware. This + has the effect of making device-mapper snapshots cluster-aware. + Not every exception store type can be wrapped. Check the end + of drivers/md/dm-ex-store-clusterized.c to find out what stores + are supported. 
config DM_MIRROR
 	tristate "Mirror target"
Index: linux-2.6/drivers/md/Makefile
===================================================================
--- linux-2.6.orig/drivers/md/Makefile
+++ linux-2.6/drivers/md/Makefile
@@ -7,6 +7,7 @@ dm-mod-objs := dm.o dm-table.o dm-target
 dm-multipath-objs := dm-path-selector.o dm-mpath.o
 dm-snapshot-objs := dm-snap.o dm-exception.o dm-exception-store.o \
 		dm-snap-persistent.o dm-snap-transient.o
+dm-exstore-clusterized-objs := dm-ex-store-clusterized.o
 dm-mirror-objs := dm-raid1.o
 dm-log-clustered-objs := dm-log-cluster.o dm-log-cluster-transfer.o
 md-mod-objs := md.o bitmap.o
@@ -36,6 +37,7 @@ obj-$(CONFIG_DM_CRYPT) += dm-crypt.o
 obj-$(CONFIG_DM_DELAY) += dm-delay.o
 obj-$(CONFIG_DM_MULTIPATH) += dm-multipath.o dm-round-robin.o
 obj-$(CONFIG_DM_SNAPSHOT) += dm-snapshot.o
+obj-$(CONFIG_DM_EXSTORE_CLUSTERIZED) += dm-exstore-clusterized.o
 obj-$(CONFIG_DM_MIRROR) += dm-mirror.o dm-log.o dm-region-hash.o
 obj-$(CONFIG_DM_LOG_CLUSTERED) += dm-log-clustered.o
 obj-$(CONFIG_DM_ZERO) += dm-zero.o
Index: linux-2.6/drivers/md/dm-ex-store-clusterized.c
===================================================================
--- /dev/null
+++ linux-2.6/drivers/md/dm-ex-store-clusterized.c
@@ -0,0 +1,702 @@
+/*
+ * Copyright (C) 2009 Red Hat, Inc. All rights reserved.
+ *
+ * Cluster-aware ("clusterized") wrapper for device-mapper exception stores.
+ *
+ * This file is released under the GPL.
+ */
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/delay.h>
+#include <linux/device-mapper.h>
+#include <linux/dlm.h>
+#include "dm-exception-store.h"
+
+#define DM_MSG_PREFIX "clusterized exception store"
+
+/* Prefix of the table argument that carries the cluster-wide UUID */
+#define CLUSTER_UUID_ARG "cluster_uuid:"
+#define CLUSTER_UUID_ARG_LEN (sizeof(CLUSTER_UUID_ARG) - 1)
+
+/*
+ * struct clusterized_c - context of one clusterized exception store
+ * @core_store: the wrapped, single-machine exception store
+ * @lock: serializes updates of the DLM bookkeeping fields below
+ * @current_dl_mode: DLM lock mode currently held by this node
+ * @dl_holders: number of local users of @current_dl_mode
+ * @dlm_completion: completed by lock_obtained() for each DLM request
+ * @lockspace: DLM lockspace the lock belongs to
+ * @lksb: DLM lock status block; its LVB is @cluster_metadata_counter
+ * @metadata_counter: metadata version last seen or written by this node
+ * @cluster_metadata_counter: metadata version exchanged through the LVB
+ * @uuid: NUL-terminated DLM resource name
+ */
+struct clusterized_c {
+	struct dm_exception_store *core_store;
+
+	struct rw_semaphore lock;
+
+	int current_dl_mode;
+	unsigned dl_holders;
+	struct completion dlm_completion;
+	dlm_lockspace_t *lockspace;
+	struct dlm_lksb lksb;
+
+	uint64_t metadata_counter;
+	uint64_t cluster_metadata_counter;
+
+	char uuid[]; /* must be last */
+};
+
+/* Name of a DLM lock mode, for diagnostics */
+#define mode2str(__mode) \
+	(((__mode) == DLM_LOCK_NL) ? "DLM_LOCK_NL" : \
+	 ((__mode) == DLM_LOCK_CR) ? "DLM_LOCK_CR" : \
+	 ((__mode) == DLM_LOCK_EX) ? "DLM_LOCK_EX" : "UNKNOWN")
+
+/* DLM completion callback: wake the thread waiting on the request */
+static void lock_obtained(void *context)
+{
+	struct clusterized_c *cc = context;
+
+	complete(&cc->dlm_completion);
+}
+
+/*
+ * __cluster_lock
+ * @cc
+ * @mode: DLM lock mode wanted
+ *
+ * Caller must hold cc->lock.
+ *
+ * Returns: 0 on success, -EAGAIN if local holders of a weaker mode
+ * must drop the lock first, -EXXX on DLM failure
+ */
+static int __cluster_lock(struct clusterized_c *cc, int mode)
+{
+	int r;
+	uint32_t flags = DLM_LKF_VALBLK | DLM_LKF_CONVERT;
+
+	/* Does the current lock mode meet or exceed our needs? */
+	if (mode <= cc->current_dl_mode) {
+		cc->dl_holders++;
+		return 0;
+	}
+
+	/*
+	 * An up-conversion is required while the lock still has local
+	 * holders.  The caller retries until they have dropped back to
+	 * DLM_LOCK_NL.  This is expected, so it is not logged - the
+	 * retry loop would otherwise flood the log.
+	 */
+	if (cc->current_dl_mode != DLM_LOCK_NL)
+		return -EAGAIN;
+
+	BUG_ON(cc->dl_holders);
+
+	r = dlm_lock(cc->lockspace, mode, &cc->lksb,
+		     flags, cc->uuid, strlen(cc->uuid), 0,
+		     lock_obtained, cc, NULL);
+	if (r) {
+		DMERR("cluster_lock immediate failure: %d", r);
+		return r;
+	}
+
+	wait_for_completion(&cc->dlm_completion);
+
+	if (cc->lksb.sb_status) {
+		DMERR("cluster_lock async failure: %d",
+		      cc->lksb.sb_status);
+		return cc->lksb.sb_status;
+	}
+
+	cc->current_dl_mode = mode;
+	cc->dl_holders = 1;
+	return 0;
+}
+
+/*
+ * __cluster_unlock
+ * @cc
+ *
+ * Caller must hold cc->lock.  Drops one local holder; the last one
+ * converts the DLM lock back to DLM_LOCK_NL.
+ *
+ * Returns: 0 on success, -EXXX on failure
+ */
+static int __cluster_unlock(struct clusterized_c *cc)
+{
+	int r;
+	uint32_t flags = DLM_LKF_VALBLK | DLM_LKF_CONVERT;
+
+	/* An unbalanced unlock would wrap the holder count */
+	if (WARN_ON(!cc->dl_holders))
+		return -EINVAL;
+
+	if (--cc->dl_holders)
+		return 0;
+
+	r = dlm_lock(cc->lockspace, DLM_LOCK_NL, &cc->lksb,
+		     flags, cc->uuid, strlen(cc->uuid), 0,
+		     lock_obtained, cc, NULL);
+	if (!r) {
+		wait_for_completion(&cc->dlm_completion);
+		r = cc->lksb.sb_status;
+	}
+
+	if (r)
+		DMERR("Failure to convert to NL lock: %d", r);
+	else
+		cc->current_dl_mode = DLM_LOCK_NL;
+
+	return r;
+}
+
+/* Acquire the initial DLM_LOCK_NL lock that later requests convert */
+static int cluster_lock_init(struct clusterized_c *cc)
+{
+	int r;
+	uint32_t flags = DLM_LKF_VALBLK | DLM_LKF_EXPEDITE;
+
+	r = dlm_lock(cc->lockspace, DLM_LOCK_NL, &cc->lksb,
+		     flags, cc->uuid, strlen(cc->uuid), 0,
+		     lock_obtained, cc, NULL);
+	if (!r) {
+		wait_for_completion(&cc->dlm_completion);
+		r = cc->lksb.sb_status;
+	}
+
+	if (r)
+		DMERR("Failed to acquire initial DLM lock: %d", r);
+
+	return r;
+}
+
+/*
+ * cluster_lock_exit
+ * @cc
+ *
+ * Releases the DLM lock.  The unlock completes asynchronously and its
+ * callback uses @cc, so wait for it before the caller frees @cc.
+ */
+static void cluster_lock_exit(struct clusterized_c *cc)
+{
+	int r;
+
+	BUG_ON(cc->current_dl_mode != DLM_LOCK_NL);
+
+	r = dlm_unlock(cc->lockspace, cc->lksb.sb_lkid,
+		       DLM_LKF_FORCEUNLOCK, &cc->lksb, cc);
+	if (r) {
+		DMERR("Failed to release DLM lock: %d", r);
+		return;
+	}
+
+	wait_for_completion(&cc->dlm_completion);
+}
+
+/*
+ * cluster_lock
+ * @cc
+ * @mode: DLM lock mode wanted
+ *
+ * Sleeps between attempts while local holders of a weaker mode
+ * prevent the up-conversion, instead of spinning on cc->lock.
+ *
+ * Returns: 0 on success, -EXXX on failure
+ */
+static int cluster_lock(struct clusterized_c *cc, int mode)
+{
+	int r;
+
+	for (;;) {
+		down_write(&cc->lock);
+		r = __cluster_lock(cc, mode);
+		up_write(&cc->lock);
+
+		if (r != -EAGAIN)
+			break;
+
+		msleep(1);
+	}
+
+	return r;
+}
+
+/*
+ * cluster_unlock
+ * @cc
+ *
+ * Doesn't completely unlock, but rather puts the lock back into
+ * the DLM_LOCK_NL mode.  This preserves the LVB.
+ */
+static int cluster_unlock(struct clusterized_c *cc)
+{
+	int r;
+
+	down_write(&cc->lock);
+	r = __cluster_unlock(cc);
+	up_write(&cc->lock);
+
+	return r;
+}
+
+/*
+ * clusterized_ctr
+ * @store
+ * @argc
+ * @argv
+ *
+ * The mapping table will be the same as the exception
+ * store it is covering, but will also include the
+ * argument:
+ *	<non-clustered args> cluster_uuid:<UUID>
+ *
+ * Returns: 0 on success, -EXXX on failure
+ */
+static int clusterized_ctr(struct dm_exception_store *store,
+			   unsigned argc, char **argv)
+{
+	int r;
+	unsigned i, j;
+	size_t uuid_len;
+	unsigned my_argc = argc + 1;
+	char *my_argv[my_argc];
+	char chunk_size_str[32];
+	const char *uuid;
+	char *core_name;
+	struct clusterized_c *cc;
+
+	/* The last argument must be the cluster_uuid */
+	if (!argc || strncmp(CLUSTER_UUID_ARG, argv[argc - 1],
+			     CLUSTER_UUID_ARG_LEN)) {
+		DMERR("No 'cluster_uuid:' argument provided.");
+		return -EINVAL;
+	}
+	argc--;
+	uuid = argv[argc] + CLUSTER_UUID_ARG_LEN;
+
+	/*
+	 * In order to pass down to non-clustered core, we must
+	 * add back the COW and chunk size arguments and strip
+	 * off the cluster_uuid argument.
+	 */
+	my_argv[0] = store->cow->name;
+	snprintf(chunk_size_str, sizeof(chunk_size_str), "%llu",
+		 (unsigned long long)store->chunk_size);
+	my_argv[1] = chunk_size_str;
+	for (i = 0, j = 2; i < argc; i++, j++)
+		my_argv[j] = argv[i];
+
+	/*
+	 * The UUID is used as the DLM resource name, so it is cut
+	 * to DLM_RESNAME_MAXLEN characters.  kzalloc() provides the
+	 * trailing NUL that the strlen(cc->uuid) calls depend on.
+	 */
+	uuid_len = strlen(uuid);
+	if (uuid_len > DLM_RESNAME_MAXLEN)
+		uuid_len = DLM_RESNAME_MAXLEN;
+	cc = kzalloc(sizeof(*cc) + uuid_len + 1, GFP_KERNEL);
+	if (!cc)
+		return -ENOMEM;
+	memcpy(cc->uuid, uuid, uuid_len);
+	cc->lksb.sb_lvbptr = (char *)&cc->cluster_metadata_counter;
+
+	init_rwsem(&cc->lock);
+	init_completion(&cc->dlm_completion);
+
+	/* Create (or join) the lock space */
+	r = dlm_new_lockspace(store->type->name, strlen(store->type->name),
+			      &cc->lockspace, 0, sizeof(uint64_t));
+	if (r) {
+		DMERR("Unable to create DLM lockspace for %s",
+		      store->type->name);
+		goto bad_lockspace;
+	}
+
+	r = cluster_lock_init(cc);
+	if (r)
+		goto bad_lock;
+
+	/*
+	 * Now we find the non-clustered exception store name.
+	 * It will be whatever is left when we strip 'clusterized-' off.
+	 */
+	core_name = strstr(store->type->name, "-");
+	BUG_ON(!core_name);
+	core_name++;
+
+	r = dm_exception_store_create(core_name, store->ti, my_argc, my_argv,
+				      &cc->core_store);
+	if (r) {
+		DMERR("Failed to create foundational exception store, %s",
+		      core_name);
+		goto bad_core_store;
+	}
+
+	/* If the core store is shared, we are shared */
+	store->shared_uuid = cc->core_store->shared_uuid;
+
+	store->context = cc;
+
+	return 0;
+
+bad_core_store:
+	cluster_lock_exit(cc);
+bad_lock:
+	dlm_release_lockspace(cc->lockspace, 1);
+bad_lockspace:
+	kfree(cc);
+	return r;
+}
+
+static void clusterized_dtr(struct dm_exception_store *store)
+{
+	struct clusterized_c *cc = store->context;
+
+	dm_exception_store_destroy(cc->core_store);
+
+	cluster_lock_exit(cc);
+	dlm_release_lockspace(cc->lockspace, 1);
+
+	kfree(cc);
+}
+
+/* Re-read the core store's metadata under a shared cluster lock */
+static int clusterized_resume(struct dm_exception_store *store)
+{
+	int r;
+	struct clusterized_c *cc = store->context;
+
+	r = cluster_lock(cc, DLM_LOCK_CR);
+	if (r)
+		return r;
+
+	r = cc->core_store->type->resume(cc->core_store);
+	if (!r)
+		cc->metadata_counter = cc->cluster_metadata_counter;
+
+	cluster_unlock(cc);
+
+	return r;
+}
+
+static void clusterized_presuspend(struct dm_exception_store *store)
+{
+	struct clusterized_c *cc = store->context;
+
+	/* The core store's methods operate on the core store, not on us */
+	if (cc->core_store->type->presuspend)
+		cc->core_store->type->presuspend(cc->core_store);
+}
+
+static void clusterized_postsuspend(struct dm_exception_store *store)
+{
+	struct clusterized_c *cc = store->context;
+
+	if (cc->core_store->type->postsuspend)
+		cc->core_store->type->postsuspend(cc->core_store);
+}
+
+/*
+ * clusterized_prepare_exception
+ *
+ * On success the exclusive cluster lock stays held until the
+ * exception is committed (see clusterized_commit_exception).
+ */
+static int clusterized_prepare_exception(struct dm_exception_store *store,
+					 struct dm_exception *e, int group)
+{
+	int r;
+	struct clusterized_c *cc = store->context;
+
+	r = cluster_lock(cc, DLM_LOCK_EX);
+	if (r)
+		return r;
+
+	r = cc->core_store->type->prepare_exception(cc->core_store, e, group);
+	if (r) {
+		DMERR("Core store failed to prepare_exception");
+		cluster_unlock(cc);
+	}
+
+	return r;
+}
+
+/* cbc - callback context */
+struct cbc {
+	struct clusterized_c *cc;
+
+	void (*callback) (void *, int success);
+	void *callback_data;
+};
+
+/*
+ * Runs when the core store has committed an exception: publish the
+ * new metadata version, notify the caller and drop the cluster lock
+ * taken in clusterized_prepare_exception.
+ */
+static void commit_callback(void *data, int success)
+{
+	struct cbc *context = data;
+
+	context->cc->metadata_counter++;
+	context->cc->cluster_metadata_counter = context->cc->metadata_counter;
+
+	context->callback(context->callback_data, success);
+	cluster_unlock(context->cc);
+
+	kfree(context);
+}
+
+static void clusterized_commit_exception(struct dm_exception_store *store,
+					 struct dm_exception *e,
+					 void (*callback) (void *, int success),
+					 void *callback_context)
+{
+	struct clusterized_c *cc = store->context;
+	struct cbc *cbc;
+
+	cbc = kmalloc(sizeof(*cbc), GFP_NOIO);
+	if (!cbc) {
+		/* Fail the commit, but don't leak the cluster lock */
+		callback(callback_context, 0);
+		cluster_unlock(cc);
+		return;
+	}
+
+	cbc->cc = cc;
+	cbc->callback = callback;
+	cbc->callback_data = callback_context;
+
+	cc->core_store->type->commit_exception(cc->core_store, e,
+					       commit_callback, cbc);
+}
+
+/*
+ * clusterized_lookup_exception
+ * @store
+ * @old
+ * @new: NULL if they don't want data back
+ * @group
+ * @can_block
+ *
+ * A "shared" exception store can alter the metadata
+ * outside the scope of our cluster-wide LVB counter.
+ * We have no way of knowing whether we need to re-read/resume
+ * the metadata if a "shared" exception store is in use, so
+ * the metadata is always re-read in that case.
+ *
+ * Returns: 0 if found, -ENOENT if not found, -Exxx otherwise
+ */
+static int clusterized_lookup_exception(struct dm_exception_store *store,
+					chunk_t old, chunk_t *new,
+					int group, int can_block)
+{
+	int r;
+	struct clusterized_c *cc = store->context;
+
+	/*
+	 * Even if the metadata counters don't match, we don't
+	 * need to re-read the metadata if we can find the
+	 * exception right now.  In fact, we don't even need to
+	 * take out the cluster lock if we are just looking in our
+	 * local cache.
+	 */
+	r = cc->core_store->type->lookup_exception(cc->core_store, old,
+						   new, group, can_block);
+
+	/* If we found the exception or there was an error, we can return */
+	if (r != -ENOENT)
+		return r;
+
+	/* We block when we acquire the DLM lock - respect !can_block */
+	if (!can_block)
+		return -EWOULDBLOCK;
+
+	r = cluster_lock(cc, DLM_LOCK_CR);
+	if (r)
+		return r;
+
+	/*
+	 * If a "shared" core exception store is used, then the
+	 * metadata_counter is incapable of keeping track of all
+	 * changes that occur, so we must re-read the metadata
+	 * (i.e. resume).
+	 */
+	if (!store->shared_uuid &&
+	    (cc->cluster_metadata_counter == cc->metadata_counter)) {
+		/*
+		 * Exception was not found, and the metadata was not
+		 * changed by other node.
+		 */
+		cluster_unlock(cc);
+		return -ENOENT;
+	}
+
+	/*
+	 * The core exception store's resume method must be capable of
+	 * re-reading its metadata and updating its cache.  IOW, it must
+	 * be able to resume multiple times before a suspend is issued.
+	 */
+	r = cc->core_store->type->resume(cc->core_store);
+	if (!r)
+		cc->metadata_counter = cc->cluster_metadata_counter;
+	cluster_unlock(cc);
+	if (r)
+		return r;
+
+	/* Now, try to find the exception again. */
+	return cc->core_store->type->lookup_exception(cc->core_store, old,
+						      new, group, can_block);
+}
+
+static void clusterized_fraction_full(struct dm_exception_store *store,
+				      sector_t *numerator, sector_t *denominator)
+{
+	struct clusterized_c *cc = store->context;
+
+	/*
+	 * FIXME: If we want more exact numbers, then we should
+	 * check the LVB for changes and potentially force the
+	 * core store to re-read metadata.
+	 */
+	cc->core_store->type->fraction_full(cc->core_store, numerator,
+					    denominator);
+}
+
+/*
+ * clusterized_status
+ *
+ * The table line is built from the core store's own output, which
+ * presumably looks like " <type> <#args> <args>": the leading space
+ * is turned into '-' to form "clusterized-<type>" and the argument
+ * count is bumped to account for the cluster_uuid argument.
+ *
+ * FIXME: only a single-digit argument count below 9 can be bumped
+ * in place; any other count is left as the core store reported it.
+ */
+static unsigned clusterized_status(struct dm_exception_store *store,
+				   status_type_t status, char *result,
+				   unsigned int maxlen)
+{
+	int sz = 0;
+	unsigned core_sz;
+	char *tmp_result;
+	struct clusterized_c *cc = store->context;
+
+	switch (status) {
+	case STATUSTYPE_INFO:
+		break;
+	case STATUSTYPE_TABLE:
+		DMEMIT(" clusterized");
+		tmp_result = result + sz;
+		core_sz = cc->core_store->type->status(cc->core_store, status,
+						       tmp_result, maxlen - sz);
+		sz += core_sz;
+
+		/* Nothing to patch up if the core store emitted nothing */
+		if (core_sz) {
+			tmp_result[0] = '-'; /* s/ /-/ */
+
+			tmp_result = strstr(tmp_result, " ");
+			if (tmp_result && tmp_result[1] >= '0' &&
+			    tmp_result[1] <= '8' &&
+			    (tmp_result[2] == ' ' || tmp_result[2] == '\0'))
+				tmp_result[1]++; /* Inc numeric char */
+		}
+
+		DMEMIT(" cluster_uuid:%s", cc->uuid);
+		break;
+	default:
+		break;
+	}
+
+	return sz;
+}
+
+static int clusterized_message(struct dm_exception_store *store,
+			       unsigned argc, char **argv)
+{
+	int r;
+	struct clusterized_c *cc = store->context;
+
+	/* Not every core store handles messages */
+	if (!cc->core_store->type->message)
+		return -EINVAL;
+
+	r = cluster_lock(cc, DLM_LOCK_EX);
+	if (r)
+		return r;
+
+	r = cc->core_store->type->message(cc->core_store, argc, argv);
+
+	/* The message may have altered the metadata - tell the other nodes */
+	cc->metadata_counter++;
+	cc->cluster_metadata_counter = cc->metadata_counter;
+	cluster_unlock(cc);
+
+	return r;
+}
+
+/*
+ * Here is where we define what core exception store types are
+ * valid for this module to clusterize.  The necessary qualities
+ * of the core exception store are:
+ *	1) Must be able to resume multiple times (i.e. re-read
+ *	   its metadata).  This is because other nodes are allowed
+ *	   to add/alter the metadata underneath you.  Ideally, only
+ *	   the deltas will be picked up when the metadata is
+ *	   re-read - as is the case with the "persistent" store.
+ *	*2) Must not be a "shared" exception store.  IOW, the alteration
+ *	   of one exception store cannot affect another.  Currently, this
+ *	   situation is not adequately handled (but could be handled if
+ *	   people really want it).
+ *
+ * If the above conditions are met, then you can simply add an additional
+ * 'dm_exception_store_type' below.  In fact, you could copy the block of
+ * code that is there and replace 'persistent' with the name of the
+ * exception store type that is being covered.
+ */
+static struct dm_exception_store_type _clusterized_persistent = {
+	.name = "clusterized-persistent",
+	.module = THIS_MODULE,
+	.ctr = clusterized_ctr,
+	.dtr = clusterized_dtr,
+	.resume = clusterized_resume,
+	.presuspend = clusterized_presuspend,
+	.postsuspend = clusterized_postsuspend,
+	.prepare_exception = clusterized_prepare_exception,
+	.commit_exception = clusterized_commit_exception,
+	.lookup_exception = clusterized_lookup_exception,
+	.fraction_full = clusterized_fraction_full,
+	.status = clusterized_status,
+	.message = clusterized_message,
+};
+
+static int __init dm_clusterized_exception_store_init(void)
+{
+	int r;
+
+	r = dm_exception_store_type_register(&_clusterized_persistent);
+	if (r)
+		DMERR("Unable to register clusterized-persistent"
+		      " exception store type: %d", r);
+	else
+		DMINFO("(built %s %s) installed", __DATE__, __TIME__);
+
+	return r;
+}
+
+static void __exit dm_clusterized_exception_store_exit(void)
+{
+	dm_exception_store_type_unregister(&_clusterized_persistent);
+	DMINFO("(built %s %s) removed", __DATE__, __TIME__);
+}
+
+module_init(dm_clusterized_exception_store_init);
+module_exit(dm_clusterized_exception_store_exit);
+
+MODULE_DESCRIPTION(DM_MSG_PREFIX);
+MODULE_AUTHOR("Jonathan Brassow <jbrassow@xxxxxxxxxx>");
+MODULE_LICENSE("GPL");
Index: linux-2.6/Documentation/dm-exception-store.txt
=================================================================== --- /dev/null +++ linux-2.6/Documentation/dm-exception-store.txt @@ -0,0 +1,45 @@ +Device-Mapper Exception Store +============================= +The device-mapper exception store code is used by device-mapper +snapshots (although other targets could find it useful as well). +The exception stores provide a way to map the old location of a +chunk (a discrete portion of the storage space) to a new location. +This remapping information is called an "exception". Snapshots +use this to track their Copy-On-Write data. + +There is a generic exception store interface. Various different +exception store implementations are available and have vastly +different characteristics. The list includes: + +Type File(s) +==== ======= +persistent drivers/md/dm-snap-persistent.c +P (deprecated) drivers/md/dm-snap-persistent.c +transient drivers/md/dm-snap-transient.c +N (deprecated) drivers/md/dm-snap-transient.c +clusterized-persistent drivers/md/dm-ex-store-clusterized.c + +The "persistent" type +--------------------- +No fancy algorithms or space efficiency considerations. Just +a simple way to store exceptions to disk. The exception store +data can survive reboots and crashes. The "P" type is simply +the old name for this exception store type. + +The "transient" type +-------------------- +No fancy algorithms or space efficiency considerations. Just +a simple way to store exceptions in memory. The exception store +data /will not/ survive reboots and crashes. The "N" type is simply +the old name for this exception store type. + +The "clusterized-persistent" type +--------------------------------- +This implementation merely provides DLM wrapping around other available +types. This is why the term 'clusterized-' is used instead of 'cluster-' +or 'clustered-'. This implementation has the ability to wrap most +present and future exception store implementations, although "persistent" +is currently the only one supported. 
This wrapping makes the exception +stores cluster-aware, which in turn makes device-mapper snapshots +cluster-aware. + -- dm-devel mailing list dm-devel@xxxxxxxxxx https://www.redhat.com/mailman/listinfo/dm-devel