[RFC dlm/next 12/12] gfs2: separate mount context by net-namespaces

Alexander Aring <aahringo@xxxxxxxxxx> · Wed, 14 Aug 2024 10:34:14 -0400

This patch changes gfs2 to be the first user of the recently introduced
net-namespace feature of DLM. It uses the current net-namespace context
of the get_tree() callback of "struct fs_context_operations" that is
e.g. being called by syscalls of the coreutils mount process.

Debugfs is being disabled for non "init_net" net-namespaces as debugfs
is not net-namespace aware. Otherwise we would see kernel log error
messages about existing debugfs entries. There might be in future
another ways to get similar information with an UAPI mechanism that is
namespace aware.

With this patch you can setup a cluster in one machine without using
multiple Linux kernel instances. The idea is to use this for
developing/testing as a first use-case. Especially scale testing, a gfs2
file system with a lot of members can be setup without running a large
set of virtual machines.

There is a small example script how it works:

https://gitlab.com/netcoder/gfs2ns-examples/-/blob/main/three_nodes

it uses multiple loop block devices references to the same file to
provide a kind of shared block device (an alternative could be using dm
linear mapping, but loop block device also works). After setting up
multiple net-namespaces with a virtual bridge interface DLM can be
configured by using nldlmd that is using the recently introduced
net-namespace aware nldlm netlink API. Over a shell command like:

ip netns exec node1 sh -c "mount /dev/loop1 /cluster/node1"
ip netns exec node2 sh -c "mount /dev/loop2 /cluster/node2"
...

every cluster "node" (hereby identifier with a "node#" string in the
iproute2 namespace management mechanism) can mount the shared block
device e.g. /dev/loop1, /dev/loop# that references to the same block
image on a different file system. Note that the caller in the script
will be unmount the file system if the last user left the namespace. I
currently workaround this use case by mountbind each block device again
in a kind of "default" namespace (usually it should be where "init_net"
references to).

Then each mountpoint of "/cluster/node#" acts like being a cluster node
access to the gfs2 file system with DLM locking enabled. This offers us
new possibilities to write testcases as we operate on a single machine
and can "easier" synchronize vfs syscalls in our testcase as no
"remote machine" kind network communication is being involved.

The mount sysfs directory for gfs2 needs to be separated by
net-namespace as our mount call is from namespace view separated and be
treated as per "node" (machine) call. Otherwise we would run into
conflicts on files/directories that already exists as the cluster wide
unique identifiers are already being in use.

The gfs2 file system has some global variables and it seems it is not
necessary to separate them by net-namespace as they store a unique per
mount identifier. I didn't had problems yet and it seems it is currently
okay to do that, otherwise there might be issues with "things" that are
not net-namespace aware in gfs2 yet. However we only should find issues
only when we using net-namespaces. If a user only uses "init_net" as
this is the current default, there should not be any problems.

Signed-off-by: Alexander Aring <aahringo@xxxxxxxxxx>
---
 fs/gfs2/glock.c      |  8 ++++++++
 fs/gfs2/incore.h     |  2 ++
 fs/gfs2/lock_dlm.c   |  2 +-
 fs/gfs2/ops_fstype.c |  5 +++++
 fs/gfs2/sys.c        | 27 ++++++++++++++++++++++++++-
 5 files changed, 42 insertions(+), 2 deletions(-)

diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c
index 12a769077ea0..21c72c1f6c61 100644
--- a/fs/gfs2/glock.c
+++ b/fs/gfs2/glock.c
@@ -2898,6 +2898,10 @@ DEFINE_SEQ_ATTRIBUTE(gfs2_sbstats);
 
 void gfs2_create_debugfs_file(struct gfs2_sbd *sdp)
 {
+	/* debugfs is only available for init_net users */
+	if (!net_eq(read_pnet(&sdp->net), &init_net))
+		return;
+
 	sdp->debugfs_dir = debugfs_create_dir(sdp->sd_table_name, gfs2_root);
 
 	debugfs_create_file("glocks", S_IFREG | S_IRUGO, sdp->debugfs_dir, sdp,
@@ -2915,6 +2919,10 @@ void gfs2_create_debugfs_file(struct gfs2_sbd *sdp)
 
 void gfs2_delete_debugfs_file(struct gfs2_sbd *sdp)
 {
+	/* debugfs is only available for init_net users */
+	if (!net_eq(read_pnet(&sdp->net), &init_net))
+		return;
+
 	debugfs_remove_recursive(sdp->debugfs_dir);
 	sdp->debugfs_dir = NULL;
 }
diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h
index aa4ef67a34e0..1273ef3dbef5 100644
--- a/fs/gfs2/incore.h
+++ b/fs/gfs2/incore.h
@@ -710,6 +710,8 @@ struct gfs2_sbd {
 
 	/* Lock Stuff */
 
+	possible_net_t net;
+	netns_tracker tracker;	/* keep track of net references */
 	struct lm_lockstruct sd_lockstruct;
 	struct gfs2_holder sd_live_gh;
 	struct gfs2_glock *sd_rename_gl;
diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c
index 6c5dce57a2ee..d8e1bb99dac2 100644
--- a/fs/gfs2/lock_dlm.c
+++ b/fs/gfs2/lock_dlm.c
@@ -1328,7 +1328,7 @@ static int gdlm_mount(struct gfs2_sbd *sdp, const char *table)
 	 * create/join lockspace
 	 */
 
-	error = dlm_new_lockspace(&init_net, fsname, cluster, flags,
+	error = dlm_new_lockspace(read_pnet(&sdp->net), fsname, cluster, flags,
 				  GDLM_LVB_SIZE, &gdlm_lockspace_ops, sdp,
 				  &ops_result, &ls->ls_dlm);
 	if (error) {
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index ff1f3e3dc65c..15ee0b2228a5 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -64,6 +64,8 @@ static void gfs2_tune_init(struct gfs2_tune *gt)
 
 void free_sbd(struct gfs2_sbd *sdp)
 {
+	put_net_track(read_pnet(&sdp->net), &sdp->tracker);
+
 	if (sdp->sd_lkstats)
 		free_percpu(sdp->sd_lkstats);
 	kfree(sdp);
@@ -71,6 +73,7 @@ void free_sbd(struct gfs2_sbd *sdp)
 
 static struct gfs2_sbd *init_sbd(struct super_block *sb)
 {
+	struct net *net = current->nsproxy->net_ns;
 	struct gfs2_sbd *sdp;
 	struct address_space *mapping;
 
@@ -78,6 +81,8 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb)
 	if (!sdp)
 		return NULL;
 
+	/* store the net-namespace the sdp is created into */
+	write_pnet(&sdp->net, get_net_track(net, &sdp->tracker, GFP_KERNEL));
 	sdp->sd_vfs = sb;
 	sdp->sd_lkstats = alloc_percpu(struct gfs2_pcpu_lkstats);
 	if (!sdp->sd_lkstats)
diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c
index ecc699f8d9fc..9c23bafc592b 100644
--- a/fs/gfs2/sys.c
+++ b/fs/gfs2/sys.c
@@ -16,6 +16,7 @@
 #include <linux/uaccess.h>
 #include <linux/gfs2_ondisk.h>
 #include <linux/blkdev.h>
+#include <linux/netdevice.h>
 
 #include "gfs2.h"
 #include "incore.h"
@@ -57,6 +58,20 @@ static const struct sysfs_ops gfs2_attr_ops = {
 
 static struct kset *gfs2_kset;
 
+/* gfs2 sysfs is separated by net-namespaces */
+static const struct kobj_ns_type_operations *
+gfs2_sysfs_object_child_ns_type(const struct kobject *kobj)
+{
+	return &net_ns_type_operations;
+}
+
+static const struct kobj_type gfs2_kset_ktype = {
+	.sysfs_ops      = &kobj_sysfs_ops,
+	.release	= kset_release,
+	.get_ownership  = kset_get_ownership,
+	.child_ns_type  = gfs2_sysfs_object_child_ns_type,
+};
+
 static ssize_t id_show(struct gfs2_sbd *sdp, char *buf)
 {
 	return snprintf(buf, PAGE_SIZE, "%u:%u\n",
@@ -383,10 +398,19 @@ static void gfs2_sbd_release(struct kobject *kobj)
 	complete(&sdp->sd_kobj_unregister);
 }
 
+/* return the net-namespace the kobj belonging to */
+static const void *gfs2_kobj_namespace(const struct kobject *kobj)
+{
+	struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj);
+
+	return read_pnet(&sdp->net);
+}
+
 static struct kobj_type gfs2_ktype = {
 	.release = gfs2_sbd_release,
 	.default_groups = gfs2_groups,
 	.sysfs_ops     = &gfs2_attr_ops,
+	.namespace = gfs2_kobj_namespace,
 };
 
 
@@ -797,7 +821,8 @@ static const struct kset_uevent_ops gfs2_uevent_ops = {
 
 int gfs2_sys_init(void)
 {
-	gfs2_kset = kset_create_and_add("gfs2", &gfs2_uevent_ops, fs_kobj);
+	gfs2_kset = kset_type_create_and_add("gfs2", &gfs2_uevent_ops,
+					     fs_kobj, &gfs2_kset_ktype);
 	if (!gfs2_kset)
 		return -ENOMEM;
 	return 0;
-- 
2.43.0