From: Jeff Layton <jlayton@xxxxxxxxxx> Add a new clustered RADOS recovery backend driver. This uses a common RADOS object to coordinate a cluster-wide grace period. We use the hostname as a persistent nodeid. Change-Id: Ic1ec91f5df7c5cbfa5254c646757b2b29e434dfb Signed-off-by: Jeff Layton <jlayton@xxxxxxxxxx> --- src/SAL/CMakeLists.txt | 3 +- src/SAL/nfs4_recovery.c | 7 +- src/SAL/recovery/recovery_rados_cluster.c | 406 ++++++++++++++++++++++ src/doc/man/ganesha-core-config.rst | 1 + src/include/sal_functions.h | 1 + 5 files changed, 413 insertions(+), 5 deletions(-) create mode 100644 src/SAL/recovery/recovery_rados_cluster.c diff --git a/src/SAL/CMakeLists.txt b/src/SAL/CMakeLists.txt index 115ff04c97ad..8af718949b98 100644 --- a/src/SAL/CMakeLists.txt +++ b/src/SAL/CMakeLists.txt @@ -38,6 +38,7 @@ if(USE_RADOS_RECOV) ${sal_STAT_SRCS} recovery/recovery_rados_kv.c recovery/recovery_rados_ng.c + recovery/recovery_rados_cluster.c ) endif(USE_RADOS_RECOV) @@ -46,7 +47,7 @@ add_sanitizers(sal) if(USE_RADOS_RECOV) include_directories(${RADOS_INCLUDE_DIR}) - target_link_libraries(sal ${RADOS_LIBRARIES}) + target_link_libraries(sal rados_grace ${RADOS_LIBRARIES}) endif(USE_RADOS_RECOV) ########### install files ############### diff --git a/src/SAL/nfs4_recovery.c b/src/SAL/nfs4_recovery.c index a0bd986ef247..22524903e17c 100644 --- a/src/SAL/nfs4_recovery.c +++ b/src/SAL/nfs4_recovery.c @@ -220,11 +220,8 @@ bool nfs_in_grace(void) */ void nfs_maybe_start_grace(void) { - if (recovery_backend->maybe_start_grace) { - if (nfs_in_grace()) - return; + if (recovery_backend->maybe_start_grace) recovery_backend->maybe_start_grace(); - } } /** @@ -485,6 +482,8 @@ static int load_backend(const char *name) rados_kv_backend_init(&recovery_backend); else if (!strcmp(name, "rados_ng")) rados_ng_backend_init(&recovery_backend); + else if (!strcmp(name, "rados_cluster")) + rados_cluster_backend_init(&recovery_backend); #endif else if (!strcmp(name, "fs_ng")) fs_ng_backend_init(&recovery_backend); diff --git a/src/SAL/recovery/recovery_rados_cluster.c b/src/SAL/recovery/recovery_rados_cluster.c new file mode 100644 index 000000000000..1e0c25109c70 --- /dev/null +++ b/src/SAL/recovery/recovery_rados_cluster.c @@ -0,0 +1,406 @@ +/* + * vim:noexpandtab:shiftwidth=8:tabstop=8: + * + * Copyright 2017 Red Hat, Inc. and/or its affiliates. + * Author: Jeff Layton <jlayton@xxxxxxxxxx> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 3 of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this library; if not, write to the Free Software + * Foundation, Inc., + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + * + * recovery_rados_cluster: a clustered recovery backing store + * + * We assume that each node has a unique nodeid, with a corresponding slot in + * the grace omap, and a rados_kv store for each server epoch. + * + * When the grace period is started, call into the rados_grace infrastructure + * to determine whether we're in a grace period and from what epoch we're + * allowed to recover state. 
Set the proper oid strings, and load the recovery + * db if applicable, and start a new one (or clear the old). + * + * Similarly, when we want to lift the grace period we'll call down into the + * rados_grace engine to clear its NEED_GRACE flag and see whether anyone else + * still needs one. If they do, we return appropriately so that the local + * grace period is not lifted, and we just re-poll for it again later. + * + * When one node needs a grace period, all nodes are obligated to begin grace + * enforcement as soon as possible. Note that they needn't allow any recovery, + * and any existing state can still be used. The important bit is that they do + * not allow the establishment of new state. This is done on each node by + * enabling the local grace period. + * + * Finally, set and clear the enforcing flag appropriately as we change the + * local grace period. When the driver notes that all nodes are in an enforcing + * state, it'll notify all exported FSALs. This allows them to safely clear + * old state held by a previous instance from the backend filesystem in order + * to handle recovery. + */ + +#include "config.h" +#include <netdb.h> +#include <rados/librados.h> +#include <rados_grace.h> +#include "log.h" +#include "nfs_core.h" +#include "sal_functions.h" +#include "recovery_rados.h" + +/* FIXME: Make this configurable -- RADOS_KV param? */ +#define RADOS_GRACE_OID "grace" + +/* Use hostname as nodeid in cluster */ +char *nodeid; +static uint64_t rados_watch_cookie; + +static void rados_grace_watchcb(void *arg, uint64_t notify_id, uint64_t handle, + uint64_t notifier_id, void *data, + size_t data_len) +{ + int ret; + + /* ACK it first, so we keep things moving along */ + ret = rados_notify_ack(rados_recov_io_ctx, RADOS_GRACE_OID, notify_id, + rados_watch_cookie, NULL, 0); + if (ret < 0) + LogEvent(COMPONENT_CLIENTID, + "rados_notify_ack failed: %d", ret); + + /* Now kick the reaper to check things out */ + nfs_notify_grace_waiters(); + reaper_wake(); +} + +static void rados_cluster_init(void) +{ + int ret; + long maxlen; + + maxlen = sysconf(_SC_HOST_NAME_MAX); + nodeid = gsh_malloc(maxlen); + ret = gethostname(nodeid, maxlen); + if (ret) { + LogEvent(COMPONENT_CLIENTID, "gethostname failed: %d", errno); + goto out_free_nodeid; + } + + ret = rados_kv_connect(&rados_recov_io_ctx, rados_kv_param.userid, + rados_kv_param.ceph_conf, rados_kv_param.pool); + if (ret < 0) { + LogEvent(COMPONENT_CLIENTID, + "Failed to connect to cluster: %d", ret); + goto out_shutdown; + } + + ret = rados_grace_create(rados_recov_io_ctx, RADOS_GRACE_OID); + if (ret < 0 && ret != -EEXIST) { + LogEvent(COMPONENT_CLIENTID, + "Failed to create grace db: %d", ret); + goto out_shutdown; + } + + /* FIXME: not sure about the 30s timeout value here */ + ret = rados_watch3(rados_recov_io_ctx, RADOS_GRACE_OID, + &rados_watch_cookie, rados_grace_watchcb, NULL, + 30, NULL); + if (ret < 0) { + LogEvent(COMPONENT_CLIENTID, + "Failed to set watch on grace db: %d", ret); + goto out_shutdown; + } + return; + +out_shutdown: + rados_kv_shutdown(); +out_free_nodeid: + gsh_free(nodeid); + nodeid = NULL; +} + +/* Try to delete old recovery db */ +static void rados_cluster_end_grace(void) +{ + int ret; + rados_write_op_t wop; + uint64_t cur, rec; + + if (rados_recov_old_oid[0] == '\0') + return; + + ret = rados_grace_enforcing_off(rados_recov_io_ctx, RADOS_GRACE_OID, + nodeid, &cur, &rec); + if (ret) + LogEvent(COMPONENT_CLIENTID, + "Failed to set grace off for %s: %d", nodeid, ret); + + wop = rados_create_write_op(); + 
rados_write_op_remove(wop); + ret = rados_write_op_operate(wop, rados_recov_io_ctx, + rados_recov_old_oid, NULL, 0); + if (ret) + LogEvent(COMPONENT_CLIENTID, "Failed to remove %s: %d", + rados_recov_old_oid, ret); + + memset(rados_recov_old_oid, '\0', sizeof(rados_recov_old_oid)); +} + +static void rados_cluster_read_clids(nfs_grace_start_t *gsp, + add_clid_entry_hook add_clid_entry, + add_rfh_entry_hook add_rfh_entry) +{ + int ret; + uint64_t cur, rec; + rados_write_op_t wop; + struct pop_args args = { + .add_clid_entry = add_clid_entry, + .add_rfh_entry = add_rfh_entry, + }; + + if (gsp) { + LogEvent(COMPONENT_CLIENTID, + "Clustered rados backend does not support takeover!"); + return; + } + + /* Start or join a grace period */ + ret = rados_grace_join(rados_recov_io_ctx, RADOS_GRACE_OID, + nodeid, &cur, &rec, true); + if (ret) { + LogEvent(COMPONENT_CLIENTID, + "Failed to join grace period: %d", ret); + return; + } + + /* + * Recovery db names are "rec-cccccccccccccccc:hostname" + * + * "rec-" followed by epoch in 16 hex digits + nodeid. + */ + snprintf(rados_recov_oid, sizeof(rados_recov_oid), + "rec-%16.16lx:%s", cur, nodeid); + wop = rados_create_write_op(); + rados_write_op_create(wop, LIBRADOS_CREATE_IDEMPOTENT, NULL); + rados_write_op_omap_clear(wop); + ret = rados_write_op_operate(wop, rados_recov_io_ctx, + rados_recov_oid, NULL, 0); + rados_release_write_op(wop); + if (ret < 0) { + LogEvent(COMPONENT_CLIENTID, "Failed to create recovery db"); + return; + }; + + snprintf(rados_recov_old_oid, sizeof(rados_recov_old_oid), + "rec-%16.16lx:%s", rec, nodeid); + ret = rados_kv_traverse(rados_kv_pop_clid_entry, &args, + rados_recov_old_oid); + if (ret < 0) + LogEvent(COMPONENT_CLIENTID, + "Failed to traverse recovery db: %d", ret); +} + +static bool rados_cluster_try_lift_grace(void) +{ + int ret; + uint64_t cur, rec; + + ret = rados_grace_lift(rados_recov_io_ctx, RADOS_GRACE_OID, + nodeid, &cur, &rec); + if (ret) { + LogEvent(COMPONENT_CLIENTID, + "Attempt to lift grace failed: %d", ret); + return false; + } + + /* Non-zero rec means grace is still in force */ + return (rec == 0); +} + +struct rados_cluster_kv_pairs { + size_t slots; /* Current array size */ + size_t num; /* Count of populated elements */ + char **keys; /* Array of key strings */ + char **vals; /* Array of value blobs */ + size_t *lens; /* Array of value lengths */ +}; + +/* + * FIXME: Since each hash tree is protected by its own mutex, we can't ensure + * that we'll get an accurate count before allocating. For now, we just + * have a fixed-size cap of 1024 entries in the db, but we should allow + * there to be an arbitrary number of entries. + */ +#define RADOS_KV_STARTING_SLOTS 1024 + +static void rados_set_client_cb(struct rbt_node *pn, void *arg) +{ + struct hash_data *addr = RBT_OPAQ(pn); + nfs_client_id_t *clientid = addr->val.addr; + struct rados_cluster_kv_pairs *kvp = arg; + char ckey[RADOS_KEY_MAX_LEN]; + char cval[RADOS_VAL_MAX_LEN]; + + /* FIXME: resize arrays in this case? */ + if (kvp->num >= kvp->slots) { + LogEvent(COMPONENT_CLIENTID, "too many clients to copy!"); + return; + } + + rados_kv_create_key(clientid, ckey); + rados_kv_create_val(clientid, cval); + + kvp->keys[kvp->num] = strdup(ckey); + kvp->vals[kvp->num] = strdup(cval); + kvp->lens[kvp->num] = strlen(cval); + ++kvp->num; +} + +/** + * @brief Start local grace period if we're in a global one + * + * In clustered setups, other machines in the cluster can start a new + * grace period. Check for that and enter the grace period if so. 
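+ *
+ * We learn about this via rados_grace_epochs() below: a non-zero recovery
+ * epoch in the grace db means some node in the cluster still needs a grace
+ * period, so we begin enforcing one locally as well and write out our
+ * confirmed clients so they can reclaim after a restart.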
+ */ +static void rados_cluster_maybe_start_grace(void) +{ + int ret, i; + nfs_grace_start_t gsp = { .event = EVENT_JUST_GRACE }; + rados_write_op_t wop; + uint64_t cur, rec; + char *keys[RADOS_KV_STARTING_SLOTS]; + char *vals[RADOS_KV_STARTING_SLOTS]; + size_t lens[RADOS_KV_STARTING_SLOTS]; + struct rados_cluster_kv_pairs kvp = { + .slots = RADOS_KV_STARTING_SLOTS, + .num = 0, + .keys = keys, + .vals = vals, + .lens = lens }; + + + /* Do nothing if we're already enforcing grace period */ + if (nfs_in_grace()) + return; + + /* Fix up the strings */ + ret = rados_grace_epochs(rados_recov_io_ctx, RADOS_GRACE_OID, + &cur, &rec); + if (ret) { + LogEvent(COMPONENT_CLIENTID, "rados_grace_epochs failed: %d", + ret); + return; + } + + /* No grace period if rec == 0 */ + if (rec == 0) + return; + + /* Start a new grace period */ + nfs_start_grace(&gsp); + + snprintf(rados_recov_oid, sizeof(rados_recov_oid), + "rec-%16.16lx:%s", cur, nodeid); + snprintf(rados_recov_old_oid, sizeof(rados_recov_old_oid), + "rec-%16.16lx:%s", rec, nodeid); + + /* Populate key/val/len arrays from confirmed client hash */ + hashtable_for_each(ht_confirmed_client_id, rados_set_client_cb, &kvp); + + /* Create new write op and package it up for callback */ + wop = rados_create_write_op(); + rados_write_op_create(wop, LIBRADOS_CREATE_IDEMPOTENT, NULL); + rados_write_op_omap_clear(wop); + rados_write_op_omap_set(wop, (char const * const *)keys, + (char const * const *)vals, + (const size_t *)lens, kvp.num); + ret = rados_write_op_operate(wop, rados_recov_io_ctx, + rados_recov_oid, NULL, 0); + if (ret) + LogEvent(COMPONENT_CLIENTID, + "rados_write_op_operate failed: %d", ret); + + rados_release_write_op(wop); + + /* Free copied strings */ + for (i = 0; i < kvp.num; ++i) { + free(kvp.keys[i]); + free(kvp.vals[i]); + } +} + +static void rados_cluster_shutdown(void) +{ + int ret; + uint64_t cur, rec; + + /* + * Request grace on clean shutdown to minimize the chance that we'll + * miss the window and the MDS kills off the old session. + * + * FIXME: only do this if our key is in the omap, and we have a + * non-empty recovery db. 
+ */ + ret = rados_grace_join(rados_recov_io_ctx, RADOS_GRACE_OID, + nodeid, &cur, &rec, true); + if (ret) + LogEvent(COMPONENT_CLIENTID, + "Failed to start grace period on shutdown: %d", ret); + + ret = rados_unwatch2(rados_recov_io_ctx, rados_watch_cookie); + if (ret) + LogEvent(COMPONENT_CLIENTID, + "Failed to unwatch grace db: %d", ret); + + rados_kv_shutdown(); + gsh_free(nodeid); + nodeid = NULL; +} + +static void rados_cluster_set_enforcing(void) +{ + int ret; + uint64_t cur, rec; + + ret = rados_grace_enforcing_on(rados_recov_io_ctx, RADOS_GRACE_OID, + nodeid, &cur, &rec); + if (ret) + LogEvent(COMPONENT_CLIENTID, + "Failed to set enforcing for %s: %d", nodeid, ret); +} + +static bool rados_cluster_grace_enforcing(void) +{ + int ret; + + ret = rados_grace_enforcing_check(rados_recov_io_ctx, RADOS_GRACE_OID); + LogEvent(COMPONENT_CLIENTID, "%s: ret=%d", __func__, ret); + return (ret == 0); +} + +struct nfs4_recovery_backend rados_cluster_backend = { + .recovery_init = rados_cluster_init, + .recovery_shutdown = rados_cluster_shutdown, + .recovery_read_clids = rados_cluster_read_clids, + .end_grace = rados_cluster_end_grace, + .add_clid = rados_kv_add_clid, + .rm_clid = rados_kv_rm_clid, + .add_revoke_fh = rados_kv_add_revoke_fh, + .maybe_start_grace = rados_cluster_maybe_start_grace, + .try_lift_grace = rados_cluster_try_lift_grace, + .set_enforcing = rados_cluster_set_enforcing, + .grace_enforcing = rados_cluster_grace_enforcing, +}; + +void rados_cluster_backend_init(struct nfs4_recovery_backend **backend) +{ + *backend = &rados_cluster_backend; +} diff --git a/src/doc/man/ganesha-core-config.rst b/src/doc/man/ganesha-core-config.rst index 6453e800262f..eb8d9bd09f2c 100644 --- a/src/doc/man/ganesha-core-config.rst +++ b/src/doc/man/ganesha-core-config.rst @@ -292,6 +292,7 @@ RecoveryBackend(path, default "fs") - fs_ng: filesystem (better resiliency) - rados_kv : rados key-value - rados_ng : rados key-value (better resiliency) + - rados_cluster: clustered rados backend (active/active) Minor_Versions(enum list, values [0, 1, 2], default [0, 1, 2]) List of supported NFSV4 minor version numbers. diff --git a/src/include/sal_functions.h b/src/include/sal_functions.h index 708290495731..a53fa475c1bb 100644 --- a/src/include/sal_functions.h +++ b/src/include/sal_functions.h @@ -1038,6 +1038,7 @@ void fs_ng_backend_init(struct nfs4_recovery_backend **); int rados_kv_set_param_from_conf(config_file_t, struct config_error_type *); void rados_kv_backend_init(struct nfs4_recovery_backend **); void rados_ng_backend_init(struct nfs4_recovery_backend **); +void rados_cluster_backend_init(struct nfs4_recovery_backend **backend); #endif #endif /* SAL_FUNCTIONS_H */ -- 2.17.0 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html
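For reference, wiring this up on each cluster node would presumably look
something like the sketch below. The "rados_cluster" RecoveryBackend value
comes from the ganesha-core-config.rst hunk above; the RADOS_KV block and
its ceph_conf/userid/pool option names are assumed from the rados_kv_param
fields that rados_cluster_init() reads, and the userid/pool values shown
are placeholders:

    NFSV4 {
        # select the new clustered recovery backend
        RecoveryBackend = rados_cluster;
    }

    RADOS_KV {
        # assumed option names, mirroring the rados_kv_param fields above
        ceph_conf = "/etc/ceph/ceph.conf";
        userid = "nfs-ganesha";         # placeholder cephx user
        pool = "nfs-ganesha-grace";     # placeholder pool for the "grace" object
    }

Every node would need to point at the same pool, since the shared grace
object name ("grace") is hard-coded for now per the FIXME, and each node
needs a distinct hostname, as the hostname doubles as the persistent nodeid.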