Re: [nfs-utils PATCH RFC 3/7] nfsdcld: a few enhancements

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Tue, 2018-11-06 at 13:36 -0500, Scott Mayhew wrote:
> 1) Adopt the concept of "reboot epochs" (but not coordinated grace
> periods via the "need" and "enforcing" flags) from Jeff Layton's
> "Active/Active NFS Server Recovery" presentation from the Fall 2018 NFS
> Bakeathon.  See
> http://nfsv4bat.org/Documents/BakeAThon/2018/Active_Active%20NFS%20Server%20Recovery.pdf
> 
> - add a new table "grace" which contains two integer columns
>   representing the "current" epoch (where new client records are stored)
>   and the "recovery" epoch (which has the records for clients that are
>   allowed to recover)
> - replace the "clients" table with table(s) named "rec-CCCCCCCCCCCCCCCC"
>   (where C is the hex value of the epoch), containing a single column
>   "id" which stores the client id string
> - when going from normal operation into grace, the current epoch becomes
>   the recovery epoch, the current epoch is incremented, and a new table
>   is created for the current epoch.  Clients are allowed to reclaim if
>   they have a record in the table corresponding to the recovery epoch
>   and new records are added to the table corresponding to the current
>   epoch.
> - when moving from grace back to normal operation, the table associated
>   with the recovery epoch is deleted and the recovery epoch becomes
>   zero.
> - if the server restarts before exiting the previous grace period, then
>   the epochs are not changed, and all records in the table associated
>   with the "current" epoch are cleared out.
> 
> 2) Allow knfsd to "slurp" the client records during startup.
> 
> During client tracking initialization, knfsd will do an upcall to get a
> list of clients from the database.  nfsdcld will do one downcall with a
> status of -EINPROGRESS for each client record in the database, followed
> by a final downcall with a status of 0.  This will allow two things:
> 
> - knfsd can check whether a client is allowed to reclaim without
>   performing an upcall to nfsdcld
> - knfsd can decide to end the grace period early by tracking the number
>   of RECLAIM_COMPLETE operations it receives from "known" clients, or
>   it can skip the grace period altogether if no clients are allowed
>   to reclaim.
> 


Thanks for doing this work, Scott. This should give us an even more
robust recovery backend that is suitable for containerization, and
possibly something we could extend to do active/active clustered NFS
properly with knfsd.

The changes look great overall -- one minor thing inline below:

> Signed-off-by: Scott Mayhew <smayhew@xxxxxxxxxx>
> ---
>  support/include/cld.h        |   1 +
>  utils/nfsdcld/Makefile.am    |   2 +-
>  utils/nfsdcld/cld-internal.h |  30 +++
>  utils/nfsdcld/nfsdcld.c      | 160 +++++++++++-
>  utils/nfsdcld/sqlite.c       | 483 ++++++++++++++++++++++++++++-------
>  utils/nfsdcld/sqlite.h       |  11 +-
>  6 files changed, 579 insertions(+), 108 deletions(-)
>  create mode 100644 utils/nfsdcld/cld-internal.h
> 
> diff --git a/support/include/cld.h b/support/include/cld.h
> index f14a9ab..c1f5b70 100644
> --- a/support/include/cld.h
> +++ b/support/include/cld.h
> @@ -33,6 +33,7 @@ enum cld_command {
>  	Cld_Remove,		/* remove record of this cm_id */
>  	Cld_Check,		/* is this cm_id allowed? */
>  	Cld_GraceDone,		/* grace period is complete */
> +	Cld_GraceStart,
>  };
>  
>  /* representation of long-form NFSv4 client ID */
> diff --git a/utils/nfsdcld/Makefile.am b/utils/nfsdcld/Makefile.am
> index 8239be8..d1da749 100644
> --- a/utils/nfsdcld/Makefile.am
> +++ b/utils/nfsdcld/Makefile.am
> @@ -13,7 +13,7 @@ sbin_PROGRAMS	= nfsdcld
>  nfsdcld_SOURCES = nfsdcld.c sqlite.c
>  nfsdcld_LDADD = ../../support/nfs/libnfs.la $(LIBEVENT) $(LIBSQLITE) $(LIBCAP)
>  
> -noinst_HEADERS	= sqlite.h
> +noinst_HEADERS	= sqlite.h cld-internal.h
>  
>  MAINTAINERCLEANFILES = Makefile.in
>  
> diff --git a/utils/nfsdcld/cld-internal.h b/utils/nfsdcld/cld-internal.h
> new file mode 100644
> index 0000000..a90cced
> --- /dev/null
> +++ b/utils/nfsdcld/cld-internal.h
> @@ -0,0 +1,30 @@
> +/*
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License
> + * as published by the Free Software Foundation; either version 2
> + * of the License, or (at your option) any later version.
> + *
> + * This program is distributed in the hope that it will be useful,
> + * but WITHOUT ANY WARRANTY; without even the implied warranty of
> + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
> + * GNU General Public License for more details.
> + *
> + * You should have received a copy of the GNU General Public License
> + * along with this program; if not, write to the Free Software
> + * Foundation, Inc., 51 Franklin Street, Fifth Floor,
> + * Boston, MA 02110-1301, USA.
> + */
> +
> +#ifndef _CLD_INTERNAL_H_
> +#define _CLD_INTERNAL_H_
> +
> +struct cld_client {
> +	int			cl_fd;
> +	struct event		cl_event;
> +	struct cld_msg	cl_msg;
> +};
> +
> +uint64_t current_epoch;
> +uint64_t recovery_epoch;
> +
> +#endif /* _CLD_INTERNAL_H_ */
> diff --git a/utils/nfsdcld/nfsdcld.c b/utils/nfsdcld/nfsdcld.c
> index 082f3ab..9b1ad98 100644
> --- a/utils/nfsdcld/nfsdcld.c
> +++ b/utils/nfsdcld/nfsdcld.c
> @@ -42,7 +42,9 @@
>  #include "xlog.h"
>  #include "nfslib.h"
>  #include "cld.h"
> +#include "cld-internal.h"
>  #include "sqlite.h"
> +#include "../mount/version.h"
>  
>  #ifndef PIPEFS_DIR
>  #define PIPEFS_DIR NFS_STATEDIR "/rpc_pipefs"
> @@ -54,19 +56,17 @@
>  #define CLD_DEFAULT_STORAGEDIR NFS_STATEDIR "/nfsdcld"
>  #endif
>  
> +#define NFSD_END_GRACE_FILE "/proc/fs/nfsd/v4_end_grace"
> +
>  #define UPCALL_VERSION		1
>  
>  /* private data structures */
> -struct cld_client {
> -	int			cl_fd;
> -	struct event		cl_event;
> -	struct cld_msg	cl_msg;
> -};
>  
>  /* global variables */
>  static char *pipepath = DEFAULT_CLD_PATH;
>  static int 		inotify_fd = -1;
>  static struct event	pipedir_event;
> +static bool old_kernel = false;
>  
>  static struct option longopts[] =
>  {
> @@ -298,6 +298,43 @@ out:
>  	return ret;
>  }
>  
> +/*
> + * Older kernels will not tell nfsdcld when a grace period has started.
> + * Therefore we have to peek at the /proc/fs/nfsd/v4_end_grace file to
> + * see if nfsd is in grace.  We have to do this for create and remove
> + * upcalls to ensure that the correct table is being updated - otherwise
> + * we could lose client records when the grace period is lifted.
> + */
> +static int
> +cld_check_grace_period(void)
> +{
> +	int fd, ret = 0;
> +	char c;
> +
> +	if (!old_kernel)
> +		return 0;
> +	if (recovery_epoch != 0)
> +		return 0;
> +	fd = open(NFSD_END_GRACE_FILE, O_RDONLY);
> +	if (fd < 0) {
> +		xlog(L_WARNING, "Unable to open %s: %m",
> +			NFSD_END_GRACE_FILE);
> +		return 1;
> +	}
> +	if (read(fd, &c, 1) < 0) {
> +		xlog(L_WARNING, "Unable to read from %s: %m",
> +			NFSD_END_GRACE_FILE);
> +		return 1;
> +	}
> +	close(fd);
> +	if (c == 'N') {
> +		xlog(L_WARNING, "nfsd is in grace but didn't send a gracestart upcall, "
> +			"please update the kernel");
> +		ret = sqlite_grace_start();
> +	}
> +	return ret;
> +}
> +
>  static void
>  cld_not_implemented(struct cld_client *clnt)
>  {
> @@ -332,14 +369,17 @@ cld_create(struct cld_client *clnt)
>  	ssize_t bsize, wsize;
>  	struct cld_msg *cmsg = &clnt->cl_msg;
>  
> +	ret = cld_check_grace_period();
> +	if (ret)
> +		goto reply;
> +
>  	xlog(D_GENERAL, "%s: create client record.", __func__);
>  
>  
>  	ret = sqlite_insert_client(cmsg->cm_u.cm_name.cn_id,
> -				   cmsg->cm_u.cm_name.cn_len,
> -				   false,
> -				   false);
> +				   cmsg->cm_u.cm_name.cn_len);
>  
> +reply:
>  	cmsg->cm_status = ret ? -EREMOTEIO : ret;
>  
>  	bsize = sizeof(*cmsg);
> @@ -365,11 +405,16 @@ cld_remove(struct cld_client *clnt)
>  	ssize_t bsize, wsize;
>  	struct cld_msg *cmsg = &clnt->cl_msg;
>  
> +	ret = cld_check_grace_period();
> +	if (ret)
> +		goto reply;
> +
>  	xlog(D_GENERAL, "%s: remove client record.", __func__);
>  
>  	ret = sqlite_remove_client(cmsg->cm_u.cm_name.cn_id,
>  				   cmsg->cm_u.cm_name.cn_len);
>  
> +reply:
>  	cmsg->cm_status = ret ? -EREMOTEIO : ret;
>  
>  	bsize = sizeof(*cmsg);
> @@ -396,12 +441,26 @@ cld_check(struct cld_client *clnt)
>  	ssize_t bsize, wsize;
>  	struct cld_msg *cmsg = &clnt->cl_msg;
>  
> +	/*
> +	 * If we get a check upcall at all, it means we're talking to an old
> +	 * kernel.  Furthermore, if we're not in grace it means this is the
> +	 * first client to do a reclaim.  Log a message and use
> +	 * sqlite_grace_start() to advance the epoch numbers.
> +	 */
> +	if (recovery_epoch == 0) {
> +		xlog(D_GENERAL, "%s: received a check upcall, please update the kernel",
> +			__func__);
> +		ret = sqlite_grace_start();
> +		if (ret)
> +			goto reply;
> +	}
> +
>  	xlog(D_GENERAL, "%s: check client record", __func__);
>  
>  	ret = sqlite_check_client(cmsg->cm_u.cm_name.cn_id,
> -				  cmsg->cm_u.cm_name.cn_len,
> -				  false);
> +				  cmsg->cm_u.cm_name.cn_len);
>  
> +reply:
>  	/* set up reply */
>  	cmsg->cm_status = ret ? -EACCES : ret;
>  
> @@ -429,11 +488,27 @@ cld_gracedone(struct cld_client *clnt)
>  	ssize_t bsize, wsize;
>  	struct cld_msg *cmsg = &clnt->cl_msg;
>  
> -	xlog(D_GENERAL, "%s: grace done. cm_gracetime=%ld", __func__,
> -			cmsg->cm_u.cm_gracetime);
> +	/*
> +	 * If we got a "gracedone" upcall while we're not in grace, then
> +	 * 1) we must be talking to an old kernel
> +	 * 2) no clients attempted to reclaim
> +	 * In that case, log a message and use sqlite_grace_start() to
> +	 * advance the epoch numbers, and then proceed as normal.
> +	 */
> +	if (recovery_epoch == 0) {
> +		xlog(D_GENERAL, "%s: received gracedone upcall "
> +			"while not in grace, please update the kernel",
> +			__func__);
> +		ret = sqlite_grace_start();
> +		if (ret)
> +			goto reply;
> +	}
> +
> +	xlog(D_GENERAL, "%s: grace done.", __func__);
>  
> -	ret = sqlite_remove_unreclaimed(cmsg->cm_u.cm_gracetime);
> +	ret = sqlite_grace_done();
>  
> +reply:
>  	/* set up reply: downcall with 0 status */
>  	cmsg->cm_status = ret ? -EREMOTEIO : ret;
>  
> @@ -453,6 +528,59 @@ cld_gracedone(struct cld_client *clnt)
>  	}
>  }
>  
> +static int
> +gracestart_callback(struct cld_client *clnt) {
> +	ssize_t bsize, wsize;
> +	struct cld_msg *cmsg = &clnt->cl_msg;
> +
> +	cmsg->cm_status = -EINPROGRESS;
> +
> +	bsize = sizeof(struct cld_msg);
> +
> +	xlog(D_GENERAL, "Sending client %.*s",
> +			cmsg->cm_u.cm_name.cn_len, cmsg->cm_u.cm_name.cn_id);
> +	wsize = atomicio((void *)write, clnt->cl_fd, cmsg, bsize);
> +	if (wsize != bsize)
> +		return -EIO;
> +	return 0;
> +}
> +
> +static void
> +cld_gracestart(struct cld_client *clnt)
> +{
> +	int ret;
> +	ssize_t bsize, wsize;
> +	struct cld_msg *cmsg = &clnt->cl_msg;
> +
> +	xlog(D_GENERAL, "%s: updating grace epochs", __func__);
> +
> +	ret = sqlite_grace_start();
> +	if (ret)
> +		goto reply;
> +
> +	xlog(D_GENERAL, "%s: sending client records to the kernel", __func__);
> +
> +	ret = sqlite_iterate_recovery(&gracestart_callback, clnt);
> +
> +reply:
> +	/* set up reply: downcall with 0 status */
> +	cmsg->cm_status = ret ? -EREMOTEIO : ret;
> +
> +	bsize = sizeof(struct cld_msg);
> +	xlog(D_GENERAL, "Doing downcall with status %d", cmsg->cm_status);
> +	wsize = atomicio((void *)write, clnt->cl_fd, cmsg, bsize);
> +	if (wsize != bsize) {
> +		xlog(L_ERROR, "%s: problem writing to cld pipe (%ld): %m",
> +			 __func__, wsize);
> +		ret = cld_pipe_open(clnt);
> +		if (ret) {
> +			xlog(L_FATAL, "%s: unable to reopen pipe: %d",
> +					__func__, ret);
> +			exit(ret);
> +		}
> +	}
> +}
> +
>  static void
>  cldcb(int UNUSED(fd), short which, void *data)
>  {
> @@ -490,6 +618,9 @@ cldcb(int UNUSED(fd), short which, void *data)
>  	case Cld_GraceDone:
>  		cld_gracedone(clnt);
>  		break;
> +	case Cld_GraceStart:
> +		cld_gracestart(clnt);
> +		break;
>  	default:
>  		xlog(L_WARNING, "%s: command %u is not yet implemented",
>  				__func__, cmsg->cm_cmd);
> @@ -586,6 +717,9 @@ main(int argc, char **argv)
>  		}
>  	}
>  
> +	if (linux_version_code() < MAKE_VERSION(4, 20, 0))
> +		old_kernel = true;
> +
>  	/* set up storage db */
>  	rc = sqlite_prepare_dbh(storagedir);
>  	if (rc) {
> diff --git a/utils/nfsdcld/sqlite.c b/utils/nfsdcld/sqlite.c
> index c59f777..67549c9 100644
> --- a/utils/nfsdcld/sqlite.c
> +++ b/utils/nfsdcld/sqlite.c
> @@ -21,17 +21,24 @@
>   * Explanation:
>   *
>   * This file contains the code to manage the sqlite backend database for the
> - * nfsdcltrack usermodehelper upcall program.
> + * nfsdcld client tracking daemon.
>   *
>   * The main database is called main.sqlite and contains the following tables:
>   *
>   * parameters: simple key/value pairs for storing database info
>   *
> - * clients: an "id" column containing a BLOB with the long-form clientid as
> - * 	    sent by the client, a "time" column containing a timestamp (in
> - * 	    epoch seconds) of when the record was last updated, and a
> - * 	    "has_session" column containing a boolean value indicating
> - * 	    whether the client has sessions (v4.1+) or not (v4.0).
> + * grace: a "current" column containing an INTEGER representing the current
> + *        epoch (where should new values be stored) and a "recovery" column
> + *        containing an INTEGER representing the recovery epoch (from what
> + *        epoch are we allowed to recover).  A recovery epoch of 0 means
> + *        normal operation (grace period not in force).  Note: sqlite stores
> + *        integers as signed values, so these must be cast to a uint64_t when
> + *        retrieving them from the database and back to an int64_t when storing
> + *        them in the database.
> + *
> + * rec-CCCCCCCCCCCCCCCC (where C is the hex representation of the epoch value):
> + *        a single "id" column containing a BLOB with the long-form clientid
> + *        as sent by the client.
>   */
>  
>  #ifdef HAVE_CONFIG_H
> @@ -47,16 +54,21 @@
>  #include <sys/types.h>
>  #include <fcntl.h>
>  #include <unistd.h>
> +#include <stdlib.h>
> +#include <stdint.h>
> +#include <limits.h>
>  #include <sqlite3.h>
>  #include <linux/limits.h>
>  
>  #include "xlog.h"
>  #include "sqlite.h"
> +#include "cld.h"
> +#include "cld-internal.h"
>  
> -#define CLTRACK_SQLITE_LATEST_SCHEMA_VERSION 2
> +#define CLD_SQLITE_LATEST_SCHEMA_VERSION 3
>  
>  /* in milliseconds */
> -#define CLTRACK_SQLITE_BUSY_TIMEOUT 10000
> +#define CLD_SQLITE_BUSY_TIMEOUT 10000
>  
>  /* private data structures */
>  
> @@ -124,7 +136,7 @@ out:
>  }
>  
>  static int
> -sqlite_maindb_update_v1_to_v2(void)
> +sqlite_maindb_update_schema(int oldversion)
>  {
>  	int ret, ret2;
>  	char *err;
> @@ -142,32 +154,66 @@ sqlite_maindb_update_v1_to_v2(void)
>  	 * transaction to guard against racing DB setup attempts
>  	 */
>  	ret = sqlite_query_schema_version();
> -	switch (ret) {
> -	case 1:
> -		/* Still at v1 -- do conversion */
> -		break;
> -	case CLTRACK_SQLITE_LATEST_SCHEMA_VERSION:
> -		/* Someone else raced in and set it up */
> -		ret = 0;
> +	if (ret != oldversion) {
> +		if (ret == CLD_SQLITE_LATEST_SCHEMA_VERSION)
> +			/* Someone else raced in and set it up */
> +			ret = 0;
> +		else
> +			/* Something went wrong -- fail! */
> +			ret = -EINVAL;
>  		goto rollback;
> -	default:
> -		/* Something went wrong -- fail! */
> -		ret = -EINVAL;
> +	}
> +
> +	/* Still at old version -- do conversion */
> +
> +	/* create grace table */
> +	ret = sqlite3_exec(dbh, "CREATE TABLE grace "
> +				"(current INTEGER , recovery INTEGER);",
> +				NULL, NULL, &err);
> +	if (ret != SQLITE_OK) {
> +		xlog(L_ERROR, "Unable to create grace table: %s", err);
> +		goto rollback;
> +	}
> +
> +	/* insert initial epochs into grace table */
> +	ret = sqlite3_exec(dbh, "INSERT OR FAIL INTO grace "
> +				"values (1, 0);",
> +				NULL, NULL, &err);
> +	if (ret != SQLITE_OK) {
> +		xlog(L_ERROR, "Unable to set initial epochs: %s", err);
> +		goto rollback;
> +	}
> +
> +	/* create recovery table for current epoch */
> +	ret = sqlite3_exec(dbh, "CREATE TABLE \"rec-0000000000000001\" "
> +				"(id BLOB PRIMARY KEY);",
> +				NULL, NULL, &err);
> +	if (ret != SQLITE_OK) {
> +		xlog(L_ERROR, "Unable to create recovery table "
> +				"for current epoch: %s", err);
> +		goto rollback;
> +	}
> +
> +	/* copy records from old clients table */
> +	ret = sqlite3_exec(dbh, "INSERT INTO \"rec-0000000000000001\" "
> +				"SELECT id FROM clients;",
> +				NULL, NULL, &err);
> +	if (ret != SQLITE_OK) {
> +		xlog(L_ERROR, "Unable to copy client records: %s", err);
>  		goto rollback;
>  	}
>  
> -	/* create v2 clients table */
> -	ret = sqlite3_exec(dbh, "ALTER TABLE clients ADD COLUMN "
> -				"has_session INTEGER;",
> +	/* drop the old clients table */
> +	ret = sqlite3_exec(dbh, "DROP TABLE clients;",
>  				NULL, NULL, &err);
>  	if (ret != SQLITE_OK) {
> -		xlog(L_ERROR, "Unable to update clients table: %s", err);
> +		xlog(L_ERROR, "Unable to drop old clients table: %s", err);
>  		goto rollback;
>  	}
>  
>  	ret = snprintf(buf, sizeof(buf), "UPDATE parameters SET value = %d "
>  			"WHERE key = \"version\";",
> -			CLTRACK_SQLITE_LATEST_SCHEMA_VERSION);
> +			CLD_SQLITE_LATEST_SCHEMA_VERSION);
>  	if (ret < 0) {
>  		xlog(L_ERROR, "sprintf failed!");
>  		goto rollback;
> @@ -205,7 +251,7 @@ rollback:
>   * transaction. On any error, rollback the transaction.
>   */
>  static int
> -sqlite_maindb_init_v2(void)
> +sqlite_maindb_init_v3(void)
>  {
>  	int ret, ret2;
>  	char *err = NULL;
> @@ -227,7 +273,7 @@ sqlite_maindb_init_v2(void)
>  	case 0:
>  		/* Query failed again -- set up DB */
>  		break;
> -	case CLTRACK_SQLITE_LATEST_SCHEMA_VERSION:
> +	case CLD_SQLITE_LATEST_SCHEMA_VERSION:
>  		/* Someone else raced in and set it up */
>  		ret = 0;
>  		goto rollback;
> @@ -245,20 +291,38 @@ sqlite_maindb_init_v2(void)
>  		goto rollback;
>  	}
>  
> -	/* create the "clients" table */
> -	ret = sqlite3_exec(dbh, "CREATE TABLE clients (id BLOB PRIMARY KEY, "
> -				"time INTEGER, has_session INTEGER);",
> +	/* create grace table */
> +	ret = sqlite3_exec(dbh, "CREATE TABLE grace "
> +				"(current INTEGER , recovery INTEGER);",
> 				NULL, NULL, &err);
>  	if (ret != SQLITE_OK) {
> -		xlog(L_ERROR, "Unable to create clients table: %s", err);
> +		xlog(L_ERROR, "Unable to create grace table: %s", err);
>  		goto rollback;
>  	}
>  
> +	/* insert initial epochs into grace table */
> +	ret = sqlite3_exec(dbh, "INSERT OR FAIL INTO grace "
> +				"values (1, 0);",
> +				NULL, NULL, &err);
> +	if (ret != SQLITE_OK) {
> +		xlog(L_ERROR, "Unable to set initial epochs: %s", err);
> +		goto rollback;
> +	}
> +
> +	/* create recovery table for current epoch */
> +	ret = sqlite3_exec(dbh, "CREATE TABLE \"rec-0000000000000001\" "
> +				"(id BLOB PRIMARY KEY);",
> +				NULL, NULL, &err);
> +	if (ret != SQLITE_OK) {
> +		xlog(L_ERROR, "Unable to create recovery table "
> +				"for current epoch: %s", err);
> +		goto rollback;
> +	}
>  
>  	/* insert version into parameters table */
>  	ret = snprintf(buf, sizeof(buf), "INSERT OR FAIL INTO parameters "
>  			"values (\"version\", \"%d\");",
> -			CLTRACK_SQLITE_LATEST_SCHEMA_VERSION);
> +			CLD_SQLITE_LATEST_SCHEMA_VERSION);
>  	if (ret < 0) {
>  		xlog(L_ERROR, "sprintf failed!");
>  		goto rollback;
> @@ -291,6 +355,42 @@ rollback:
>  	goto out;
>  }
>  
> +static int
> +sqlite_startup_query_grace(void)
> +{
> +	int ret;
> +	uint64_t tcur;
> +	uint64_t trec;
> +	sqlite3_stmt *stmt = NULL;
> +
> +	/* prepare select query */
> +	ret = sqlite3_prepare_v2(dbh, "SELECT * FROM grace;", -1, &stmt, NULL);
> +	if (ret != SQLITE_OK) {
> +		xlog(D_GENERAL, "Unable to prepare select statement: %s",
> +			sqlite3_errmsg(dbh));
> +		goto out;
> +	}
> +
> +	ret = sqlite3_step(stmt);
> +	if (ret != SQLITE_ROW) {
> +		xlog(D_GENERAL, "Select statement execution failed: %s",
> +				sqlite3_errmsg(dbh));
> +		goto out;
> +	}
> +
> +	tcur = (uint64_t)sqlite3_column_int(stmt, 0);
> +	trec = (uint64_t)sqlite3_column_int(stmt, 1);

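I think you want to use sqlite3_column_int64 here. sqlite3_column_int
returns a 32-bit int, so a 64-bit epoch value would be truncated before
it is cast to uint64_t: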

https://www.sqlite.org/c3ref/column_blob.html

> +
> +	current_epoch = tcur;
> +	recovery_epoch = trec;
> +	ret = 0;
> +	xlog(D_GENERAL, "%s: current_epoch=%lu recovery_epoch=%lu",
> +		__func__, current_epoch, recovery_epoch);
> +out:
> +	sqlite3_finalize(stmt);
> +	return ret;
> +}
> +
>  /* Open the database and set up the database handle for it */
>  int
>  sqlite_prepare_dbh(const char *topdir)
> @@ -322,7 +422,7 @@ sqlite_prepare_dbh(const char *topdir)
>  	}
>  
>  	/* set busy timeout */
> -	ret = sqlite3_busy_timeout(dbh, CLTRACK_SQLITE_BUSY_TIMEOUT);
> +	ret = sqlite3_busy_timeout(dbh, CLD_SQLITE_BUSY_TIMEOUT);
>  	if (ret != SQLITE_OK) {
>  		xlog(L_ERROR, "Unable to set sqlite busy timeout: %s",
>  				sqlite3_errmsg(dbh));
> @@ -331,19 +431,26 @@ sqlite_prepare_dbh(const char *topdir)
>  
>  	ret = sqlite_query_schema_version();
>  	switch (ret) {
> -	case CLTRACK_SQLITE_LATEST_SCHEMA_VERSION:
> +	case CLD_SQLITE_LATEST_SCHEMA_VERSION:
>  		/* DB is already set up. Do nothing */
>  		ret = 0;
>  		break;
> +	case 2:
> +		/* Old DB -- update to new schema */
> +		ret = sqlite_maindb_update_schema(2);
> +		if (ret)
> +			goto out_close;
> +		break;
> +
>  	case 1:
>  		/* Old DB -- update to new schema */
> -		ret = sqlite_maindb_update_v1_to_v2();
> +		ret = sqlite_maindb_update_schema(1);
>  		if (ret)
>  			goto out_close;
>  		break;
>  	case 0:
>  		/* Query failed -- try to set up new DB */
> -		ret = sqlite_maindb_init_v2();
> +		ret = sqlite_maindb_init_v3();
>  		if (ret)
>  			goto out_close;
>  		break;
> @@ -351,11 +458,13 @@ sqlite_prepare_dbh(const char *topdir)
>  		/* Unknown DB version -- downgrade? Fail */
>  		xlog(L_ERROR, "Unsupported database schema version! "
>  			"Expected %d, got %d.",
> -			CLTRACK_SQLITE_LATEST_SCHEMA_VERSION, ret);
> +			CLD_SQLITE_LATEST_SCHEMA_VERSION, ret);
>  		ret = -EINVAL;
>  		goto out_close;
>  	}
>  
> +	ret = sqlite_startup_query_grace();
> +
>  	return ret;
>  out_close:
>  	sqlite3_close(dbh);
> @@ -369,20 +478,22 @@ out_close:
>   * Returns a non-zero sqlite error code, or SQLITE_OK (aka 0)
>   */
>  int
> -sqlite_insert_client(const unsigned char *clname, const size_t namelen,
> -			const bool has_session, const bool zerotime)
> +sqlite_insert_client(const unsigned char *clname, const size_t namelen)
>  {
>  	int ret;
>  	sqlite3_stmt *stmt = NULL;
>  
> -	if (zerotime)
> -		ret = sqlite3_prepare_v2(dbh, "INSERT OR REPLACE INTO clients "
> -				"VALUES (?, 0, ?);", -1, &stmt, NULL);
> -	else
> -		ret = sqlite3_prepare_v2(dbh, "INSERT OR REPLACE INTO clients "
> -				"VALUES (?, strftime('%s', 'now'), ?);", -1,
> -				&stmt, NULL);
> +	ret = snprintf(buf, sizeof(buf), "INSERT OR REPLACE INTO \"rec-%016lx\" "
> +				"VALUES (?);", current_epoch);
> +	if (ret < 0) {
> +		xlog(L_ERROR, "sprintf failed!");
> +		return ret;
> +	} else if ((size_t)ret >= sizeof(buf)) {
> +		xlog(L_ERROR, "sprintf output too long! (%d chars)", ret);
> +		return -EINVAL;
> +	}
>  
> +	ret = sqlite3_prepare_v2(dbh, buf, -1, &stmt, NULL);
>  	if (ret != SQLITE_OK) {
>  		xlog(L_ERROR, "%s: insert statement prepare failed: %s",
>  			__func__, sqlite3_errmsg(dbh));
> @@ -397,13 +508,6 @@ sqlite_insert_client(const unsigned char *clname, const size_t namelen,
>  		goto out_err;
>  	}
>  
> -	ret = sqlite3_bind_int(stmt, 2, (int)has_session);
> -	if (ret != SQLITE_OK) {
> -		xlog(L_ERROR, "%s: bind int failed: %s", __func__,
> -				sqlite3_errmsg(dbh));
> -		goto out_err;
> -	}
> -
>  	ret = sqlite3_step(stmt);
>  	if (ret == SQLITE_DONE)
>  		ret = SQLITE_OK;
> @@ -424,8 +528,18 @@ sqlite_remove_client(const unsigned char *clname, const size_t namelen)
>  	int ret;
>  	sqlite3_stmt *stmt = NULL;
>  
> -	ret = sqlite3_prepare_v2(dbh, "DELETE FROM clients WHERE id==?", -1,
> -				 &stmt, NULL);
> +	ret = snprintf(buf, sizeof(buf), "DELETE FROM \"rec-%016lx\" "
> +				"WHERE id==?;", current_epoch);
> +	if (ret < 0) {
> +		xlog(L_ERROR, "sprintf failed!");
> +		return ret;
> +	} else if ((size_t)ret >= sizeof(buf)) {
> +		xlog(L_ERROR, "sprintf output too long! (%d chars)", ret);
> +		return -EINVAL;
> +	}
> +
> +	ret = sqlite3_prepare_v2(dbh, buf, -1, &stmt, NULL);
> +
>  	if (ret != SQLITE_OK) {
>  		xlog(L_ERROR, "%s: statement prepare failed: %s",
>  				__func__, sqlite3_errmsg(dbh));
> @@ -459,18 +573,26 @@ out_err:
>   * return an error.
>   */
>  int
> -sqlite_check_client(const unsigned char *clname, const size_t namelen,
> -			const bool has_session)
> +sqlite_check_client(const unsigned char *clname, const size_t namelen)
>  {
>  	int ret;
>  	sqlite3_stmt *stmt = NULL;
>  
> -	ret = sqlite3_prepare_v2(dbh, "SELECT count(*) FROM clients WHERE "
> -				      "id==?", -1, &stmt, NULL);
> +	ret = snprintf(buf, sizeof(buf), "SELECT count(*) FROM  \"rec-%016lx\" "
> +				"WHERE id==?;", recovery_epoch);
> +	if (ret < 0) {
> +		xlog(L_ERROR, "sprintf failed!");
> +		return ret;
> +	} else if ((size_t)ret >= sizeof(buf)) {
> +		xlog(L_ERROR, "sprintf output too long! (%d chars)", ret);
> +		return -EINVAL;
> +	}
> +
> +	ret = sqlite3_prepare_v2(dbh, buf, -1, &stmt, NULL);
>  	if (ret != SQLITE_OK) {
> -		xlog(L_ERROR, "%s: unable to prepare update statement: %s",
> -				__func__, sqlite3_errmsg(dbh));
> -		goto out_err;
> +		xlog(L_ERROR, "%s: select statement prepare failed: %s",
> +			__func__, sqlite3_errmsg(dbh));
> +		return ret;
>  	}
>  
>  	ret = sqlite3_bind_blob(stmt, 1, (const void *)clname, namelen,
> @@ -495,37 +617,10 @@ sqlite_check_client(const unsigned char *clname, const size_t namelen,
>  		goto out_err;
>  	}
>  
> -	/* Only update timestamp for v4.0 clients */
> -	if (has_session) {
> -		ret = SQLITE_OK;
> -		goto out_err;
> -	}
> -
>  	sqlite3_finalize(stmt);
> -	stmt = NULL;
> -	ret = sqlite3_prepare_v2(dbh, "UPDATE OR FAIL clients SET "
> -				      "time=strftime('%s', 'now') WHERE id==?",
> -				 -1, &stmt, NULL);
> -	if (ret != SQLITE_OK) {
> -		xlog(L_ERROR, "%s: unable to prepare update statement: %s",
> -				__func__, sqlite3_errmsg(dbh));
> -		goto out_err;
> -	}
>  
> -	ret = sqlite3_bind_blob(stmt, 1, (const void *)clname, namelen,
> -				SQLITE_STATIC);
> -	if (ret != SQLITE_OK) {
> -		xlog(L_ERROR, "%s: bind blob failed: %s",
> -				__func__, sqlite3_errmsg(dbh));
> -		goto out_err;
> -	}
> -
> -	ret = sqlite3_step(stmt);
> -	if (ret == SQLITE_DONE)
> -		ret = SQLITE_OK;
> -	else
> -		xlog(L_ERROR, "%s: unexpected return code from update: %s",
> -				__func__, sqlite3_errmsg(dbh));
> +	/* Now insert the client into the table for the current epoch */
> +	return sqlite_insert_client(clname, namelen);
>  
>  out_err:
>  	xlog(D_GENERAL, "%s: returning %d", __func__, ret);
> @@ -599,3 +694,211 @@ sqlite_query_reclaiming(const time_t grace_start)
>  			"reclaim", __func__, ret);
>  	return ret;
>  }
> +
> +int
> +sqlite_grace_start(void)
> +{
> +	int ret, ret2;
> +	char *err;
> +	uint64_t tcur = current_epoch;
> +	uint64_t trec = recovery_epoch;
> +
> +	/* begin transaction */
> +	ret = sqlite3_exec(dbh, "BEGIN EXCLUSIVE TRANSACTION;", NULL, NULL,
> +				&err);
> +	if (ret != SQLITE_OK) {
> +		xlog(L_ERROR, "Unable to begin transaction: %s", err);
> +		goto rollback;
> +	}
> +
> +	if (trec == 0) {
> +		/*
> +		 * A normal grace start - update the epoch values in the grace
> +		 * table and create a new table for the current reboot epoch.
> +		 */
> +		trec = tcur;
> +		tcur++;
> +
> +		ret = snprintf(buf, sizeof(buf), "UPDATE grace "
> +				"SET current = %ld, recovery = %ld;",
> +				(int64_t)tcur, (int64_t)trec);
> +		if (ret < 0) {
> +			xlog(L_ERROR, "sprintf failed!");
> +			goto rollback;
> +		} else if ((size_t)ret >= sizeof(buf)) {
> +			xlog(L_ERROR, "sprintf output too long! (%d chars)",
> +				ret);
> +			ret = -EINVAL;
> +			goto rollback;
> +		}
> +
> +		ret = sqlite3_exec(dbh, (const char *)buf, NULL, NULL, &err);
> +		if (ret != SQLITE_OK) {
> +			xlog(L_ERROR, "Unable to update epochs: %s", err);
> +			goto rollback;
> +		}
> +
> +		ret = snprintf(buf, sizeof(buf), "CREATE TABLE \"rec-%016lx\" "
> +				"(id BLOB PRIMARY KEY);",
> +				tcur);
> +		if (ret < 0) {
> +			xlog(L_ERROR, "sprintf failed!");
> +			goto rollback;
> +		} else if ((size_t)ret >= sizeof(buf)) {
> +			xlog(L_ERROR, "sprintf output too long! (%d chars)",
> +				ret);
> +			ret = -EINVAL;
> +			goto rollback;
> +		}
> +
> +		ret = sqlite3_exec(dbh, (const char *)buf, NULL, NULL, &err);
> +		if (ret != SQLITE_OK) {
> +			xlog(L_ERROR, "Unable to create table for current epoch: %s",
> +				err);
> +			goto rollback;
> +		}
> +	} else {
> +		/* Server restarted while in grace - don't update the epoch
> +		 * values in the grace table, just clear out the records for
> +		 * the current reboot epoch.
> +		 */
> +		ret = snprintf(buf, sizeof(buf), "DELETE FROM \"rec-%016lx\";",
> +				tcur);
> +		if (ret < 0) {
> +			xlog(L_ERROR, "sprintf failed!");
> +			goto rollback;
> +		} else if ((size_t)ret >= sizeof(buf)) {
> +			xlog(L_ERROR, "sprintf output too long! (%d chars)", ret);
> +			ret = -EINVAL;
> +			goto rollback;
> +		}
> +
> +		ret = sqlite3_exec(dbh, (const char *)buf, NULL, NULL, &err);
> +		if (ret != SQLITE_OK) {
> +			xlog(L_ERROR, "Unable to clear table for current epoch: %s",
> +				err);
> +			goto rollback;
> +		}
> +	}
> +
> +	ret = sqlite3_exec(dbh, "COMMIT TRANSACTION;", NULL, NULL, &err);
> +	if (ret != SQLITE_OK) {
> +		xlog(L_ERROR, "Unable to commit transaction: %s", err);
> +		goto rollback;
> +	}
> +
> +	current_epoch = tcur;
> +	recovery_epoch = trec;
> +	xlog(D_GENERAL, "%s: current_epoch=%lu recovery_epoch=%lu",
> +		__func__, current_epoch, recovery_epoch);
> +
> +out:
> +	sqlite3_free(err);
> +	return ret;
> +rollback:
> +	ret2 = sqlite3_exec(dbh, "ROLLBACK TRANSACTION;", NULL, NULL, &err);
> +	if (ret2 != SQLITE_OK)
> +		xlog(L_ERROR, "Unable to rollback transaction: %s", err);
> +	goto out;
> +}
> +
> +int
> +sqlite_grace_done(void)
> +{
> +	int ret, ret2;
> +	char *err;
> +
> +	/* begin transaction */
> +	ret = sqlite3_exec(dbh, "BEGIN EXCLUSIVE TRANSACTION;", NULL, NULL,
> +				&err);
> +	if (ret != SQLITE_OK) {
> +		xlog(L_ERROR, "Unable to begin transaction: %s", err);
> +		goto rollback;
> +	}
> +
> +	ret = sqlite3_exec(dbh, "UPDATE grace SET recovery = \"0\";",
> +			NULL, NULL, &err);
> +	if (ret != SQLITE_OK) {
> +		xlog(L_ERROR, "Unable to clear recovery epoch: %s", err);
> +		goto rollback;
> +	}
> +
> +	ret = snprintf(buf, sizeof(buf), "DROP TABLE \"rec-%016lx\";",
> +		recovery_epoch);
> +	if (ret < 0) {
> +		xlog(L_ERROR, "sprintf failed!");
> +		goto rollback;
> +	} else if ((size_t)ret >= sizeof(buf)) {
> +		xlog(L_ERROR, "sprintf output too long! (%d chars)", ret);
> +		ret = -EINVAL;
> +		goto rollback;
> +	}
> +
> +	ret = sqlite3_exec(dbh, (const char *)buf, NULL, NULL, &err);
> +	if (ret != SQLITE_OK) {
> +		xlog(L_ERROR, "Unable to drop table for recovery epoch: %s",
> +			err);
> +		goto rollback;
> +	}
> +
> +	ret = sqlite3_exec(dbh, "COMMIT TRANSACTION;", NULL, NULL, &err);
> +	if (ret != SQLITE_OK) {
> +		xlog(L_ERROR, "Unable to commit transaction: %s", err);
> +		goto rollback;
> +	}
> +
> +	recovery_epoch = 0;
> +	xlog(D_GENERAL, "%s: current_epoch=%lu recovery_epoch=%lu",
> +		__func__, current_epoch, recovery_epoch);
> +
> +out:
> +	sqlite3_free(err);
> +	return ret;
> +rollback:
> +	ret2 = sqlite3_exec(dbh, "ROLLBACK TRANSACTION;", NULL, NULL, &err);
> +	if (ret2 != SQLITE_OK)
> +		xlog(L_ERROR, "Unable to rollback transaction: %s", err);
> +	goto out;
> +}
> +
> +
> +int
> +sqlite_iterate_recovery(int (*cb)(struct cld_client *clnt), struct cld_client *clnt)
> +{
> +	int ret;
> +	sqlite3_stmt *stmt = NULL;
> +	struct cld_msg *cmsg = &clnt->cl_msg;
> +
> +	if (recovery_epoch == 0) {
> +		xlog(D_GENERAL, "%s: not in grace!", __func__);
> +		return -EINVAL;
> +	}
> +
> +	ret = snprintf(buf, sizeof(buf), "SELECT * FROM \"rec-%016lx\";",
> +		recovery_epoch);
> +	if (ret < 0) {
> +		xlog(L_ERROR, "sprintf failed!");
> +		return ret;
> +	} else if ((size_t)ret >= sizeof(buf)) {
> +		xlog(L_ERROR, "sprintf output too long! (%d chars)", ret);
> +		return -EINVAL;
> +	}
> +
> +	ret = sqlite3_prepare_v2(dbh, buf, -1, &stmt, NULL);
> +	if (ret != SQLITE_OK) {
> +		xlog(L_ERROR, "%s: select statement prepare failed: %s",
> +			__func__, sqlite3_errmsg(dbh));
> +		return ret;
> +	}
> +
> +	while ((ret = sqlite3_step(stmt)) == SQLITE_ROW) {
> +		memcpy(&cmsg->cm_u.cm_name.cn_id, sqlite3_column_blob(stmt, 0),
> +			NFS4_OPAQUE_LIMIT);
> +		cmsg->cm_u.cm_name.cn_len = sqlite3_column_bytes(stmt, 0);
> +		cb(clnt);
> +	}
> +	if (ret == SQLITE_DONE)
> +		ret = 0;
> +	sqlite3_finalize(stmt);
> +	return ret;
> +}
> diff --git a/utils/nfsdcld/sqlite.h b/utils/nfsdcld/sqlite.h
> index 06e7c04..5c56f75 100644
> --- a/utils/nfsdcld/sqlite.h
> +++ b/utils/nfsdcld/sqlite.h
> @@ -20,13 +20,16 @@
>  #ifndef _SQLITE_H_
>  #define _SQLITE_H_
>  
> +struct cld_client;
> +
>  int sqlite_prepare_dbh(const char *topdir);
> -int sqlite_insert_client(const unsigned char *clname, const size_t namelen,
> -				const bool has_session, const bool zerotime);
> +int sqlite_insert_client(const unsigned char *clname, const size_t namelen);
>  int sqlite_remove_client(const unsigned char *clname, const size_t namelen);
> -int sqlite_check_client(const unsigned char *clname, const size_t namelen,
> -				const bool has_session);
> +int sqlite_check_client(const unsigned char *clname, const size_t namelen);
>  int sqlite_remove_unreclaimed(const time_t grace_start);
>  int sqlite_query_reclaiming(const time_t grace_start);
> +int sqlite_grace_start(void);
> +int sqlite_grace_done(void);
> +int sqlite_iterate_recovery(int (*cb)(struct cld_client *clnt), struct cld_client *clnt);
>  
>  #endif /* _SQLITE_H */

-- 
Jeff Layton <jlayton@xxxxxxxxxx>




[Index of Archives]     [Linux Filesystem Development]     [Linux USB Development]     [Linux Media Development]     [Video for Linux]     [Linux NILFS]     [Linux Audio Users]     [Yosemite Info]     [Linux SCSI]

  Powered by Linux