[RFC][PATCH] ns: Syscalls for better namespace sharing control. v2

ebiederm@xxxxxxxxxxxx (Eric W. Biederman) · Thu, 25 Feb 2010 19:15:28 -0800

Introduce two new system calls:
int nsfd(pid_t pid, unsigned long nstype);
int setns(unsigned long nstype, int fd);

These two new system calls address three specific problems that can
make namespaces hard to work with.
- Namespaces require a dedicated process to pin them in memory.
- It is not possible to use a namespace unless you are the
  child of the original creator.
- Namespaces don't have names that userspace can use to talk
  about them.

The nsfd() system call returns a file descriptor that can
be used to talk about a specific namespace, and to keep
the specified namespace alive.

The file descriptor returned from nsfd has the lifetime
semantics of a deleted file.  As long as the fd is
open or it is bind mounted into the filesystem
namespace the namespace will be kept alive.

The fd returned by nsfd() can be bind mounted as:
mount --bind /proc/self/fd/N /some/filesystem/path

open works on the fd returned by nsfd() so another
process can get a hold of it and do interesting things.

Overall that allows for naming of namespaces with
userspace policy.

setns() allows changing the namespace of the current process
to a namespace that originates with nsfd().

v2: The code is tested and works in the common case.
    The vfs has some of the strangest rules...

Signed-off-by: Eric W. Biederman <ebiederm@xxxxxxxxxxxx>
---

Enough for one day.  This code works; now it just needs
some more use/testing and careful scrutiny before 2.6.35 rolls
around.

 arch/x86/ia32/ia32entry.S          |    2 +
 arch/x86/include/asm/unistd_32.h   |    4 +-
 arch/x86/include/asm/unistd_64.h   |    4 +
 arch/x86/kernel/syscall_table_32.S |    2 +
 fs/Makefile                        |    2 +-
 fs/nsfd.c                          |  320 ++++++++++++++++++++++++++++++++++++
 include/linux/magic.h              |    1 +
 include/linux/nsproxy.h            |    1 +
 include/linux/nstype.h             |    6 +
 kernel/nsproxy.c                   |   17 ++
 10 files changed, 357 insertions(+), 2 deletions(-)
 create mode 100644 fs/nsfd.c
 create mode 100644 include/linux/nstype.h

diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 53147ad..9fd33de 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -842,4 +842,6 @@ ia32_sys_call_table:
 	.quad compat_sys_rt_tgsigqueueinfo	/* 335 */
 	.quad sys_perf_event_open
 	.quad compat_sys_recvmmsg
+	.quad sys_nsfd
+	.quad sys_setns
 ia32_syscall_end:
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index 3baf379..5b7833c 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -343,10 +343,12 @@
 #define __NR_rt_tgsigqueueinfo	335
 #define __NR_perf_event_open	336
 #define __NR_recvmmsg		337
+#define __NR_nsfd		338
+#define __NR_setns		339
 
 #ifdef __KERNEL__
 
-#define NR_syscalls 338
+#define NR_syscalls 340
 
 #define __ARCH_WANT_IPC_PARSE_VERSION
 #define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 4843f7b..260d542 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -663,6 +663,10 @@ __SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
 __SYSCALL(__NR_perf_event_open, sys_perf_event_open)
 #define __NR_recvmmsg				299
 __SYSCALL(__NR_recvmmsg, sys_recvmmsg)
+#define __NR_nsfd				300
+__SYSCALL(__NR_nsfd, sys_nsfd)
+#define __NR_setns				301
+__SYSCALL(__NR_setns, sys_setns)
 
 #ifndef __NO_STUBS
 #define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index 15228b5..e09a45b 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -337,3 +337,5 @@ ENTRY(sys_call_table)
 	.long sys_rt_tgsigqueueinfo	/* 335 */
 	.long sys_perf_event_open
 	.long sys_recvmmsg
+	.long sys_nsfd
+	.long sys_setns
diff --git a/fs/Makefile b/fs/Makefile
index af6d047..74d5091 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -11,7 +11,7 @@ obj-y :=	open.o read_write.o file_table.o super.o \
 		attr.o bad_inode.o file.o filesystems.o namespace.o \
 		seq_file.o xattr.o libfs.o fs-writeback.o \
 		pnode.o drop_caches.o splice.o sync.o utimes.o \
-		stack.o fs_struct.o
+		stack.o fs_struct.o nsfd.o
 
 ifeq ($(CONFIG_BLOCK),y)
 obj-y +=	buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
diff --git a/fs/nsfd.c b/fs/nsfd.c
new file mode 100644
index 0000000..ec04a1e
--- /dev/null
+++ b/fs/nsfd.c
@@ -0,0 +1,320 @@
+#include <linux/nstype.h>
+#include <linux/fs.h>
+#include <linux/magic.h>
+#include <net/net_namespace.h>
+#include <linux/file.h>
+#include <linux/mount.h>
+#include <linux/cred.h>
+#include <linux/sched.h>
+#include <linux/ptrace.h>
+#include <linux/nsproxy.h>
+#include <linux/kernel.h>
+#include <linux/syscalls.h>
+#include <linux/fs_struct.h>
+
+/* Internal mount of the nsfd filesystem; assigned by kern_mount() in nsfd_init(). */
+static struct vfsmount *nsfd_mnt __read_mostly;
+/* The inode every nsfd dentry is created on; saved by nsfd_fill_super(). */
+static struct inode *nsfd_inode;
+
+/*
+ * File operations for nsfd files.  Only llseek is provided (no_llseek).
+ * nsfd_fget() also compares a file's f_op against this table to recognise
+ * files that were created by nsfd().
+ */
+static const struct file_operations nsfd_file_operations = {
+	.llseek = no_llseek,
+};
+
+/* Superblock operations: only statfs is provided, via simple_statfs. */
+static const struct super_operations nsfd_super_operations = {
+	.statfs		= simple_statfs,
+};
+
+/*
+ * nsfd_dname - d_dname callback producing the fixed name "nsfd".
+ * @dentry: dentry being named (unused; every nsfd dentry has the same name)
+ * @buffer: caller-supplied buffer
+ * @buflen: size of @buffer in bytes
+ *
+ * The name, including its terminating NUL, is written at the END of
+ * @buffer, following the dynamic_dname() convention: users of d_path()
+ * may derive the string length from the distance between the returned
+ * pointer and the end of the buffer, so a name placed at the start of
+ * the buffer would drag uninitialised bytes along with it.
+ *
+ * Returns a pointer to the name inside @buffer, or ERR_PTR(-ENAMETOOLONG)
+ * when @buffer is too small (a negative @buflen counts as too small).
+ */
+static char *nsfd_dname(struct dentry *dentry, char *buffer, int buflen)
+{
+	static const char name[] = "nsfd";
+
+	/* Compare as signed so a negative buflen cannot wrap to a huge size. */
+	if (buflen < (int)sizeof(name))
+		return ERR_PTR(-ENAMETOOLONG);
+
+	return memcpy(buffer + buflen - sizeof(name), name, sizeof(name));
+}
+
+/*
+ * Default dentry operations installed by nsfd_alloc_dentry(): they supply
+ * only the name.  There is no d_release here, so a dentry still carrying
+ * these operations has no namespace reference to drop.
+ */
+static const struct dentry_operations nsfd_dentry_operations = {
+	.d_dname		= nsfd_dname,
+};
+
+/*
+ * Allocate and initialise the inode that backs nsfd files on @sb.
+ *
+ * Returns the new inode, or ERR_PTR(-ENOMEM) when new_inode() fails.
+ * Note that failure is never reported as NULL.
+ */
+static struct inode *nsfd_mkinode(struct super_block *sb)
+{
+	struct inode *ino = new_inode(sb);
+
+	if (!ino)
+		return ERR_PTR(-ENOMEM);
+
+	/*
+	 * Start life flagged I_DIRTY: mark_inode_dirty() then believes the
+	 * inode already sits on the dirty list and never moves it there.
+	 */
+	ino->i_state = I_DIRTY;
+
+	ino->i_ino   = 1;
+	ino->i_mode  = S_IFREG | S_IRUSR | S_IWUSR;
+	ino->i_flags = S_IMMUTABLE;
+	ino->i_fop   = &nsfd_file_operations;
+	ino->i_ctime = ino->i_mtime = ino->i_atime = CURRENT_TIME;
+
+	return ino;
+}
+
+/*
+ * Create a dentry on @inode with d_alloc_root() and tag it with the
+ * default nsfd dentry operations.  Returns NULL if the allocation fails.
+ *
+ * On success an extra reference is taken on @inode.  The nsfd inode's
+ * count is known to stay above zero, so an open-coded atomic_inc() is
+ * used in place of igrab().
+ */
+static struct dentry *nsfd_alloc_dentry(struct inode *inode)
+{
+	struct dentry *root = d_alloc_root(inode);
+
+	if (!root)
+		return NULL;
+
+	atomic_inc(&inode->i_count);
+	root->d_op = &nsfd_dentry_operations;
+	return root;
+}
+
+/*
+ * nsfd_fill_super - set up the nsfd superblock, its inode and root dentry.
+ * @sb: superblock to initialise
+ * @data: mount data (unused)
+ * @silent: unused
+ *
+ * Returns 0 on success or a negative errno.  On success the inode is also
+ * remembered in nsfd_inode so that nsfd files can be created on it later.
+ */
+static int nsfd_fill_super(struct super_block *sb, void *data, int silent)
+{
+	struct inode *inode;
+
+	sb->s_flags		= 0;
+	sb->s_maxbytes		= MAX_LFS_FILESIZE;
+	sb->s_blocksize		= PAGE_SIZE;
+	sb->s_blocksize_bits	= PAGE_SHIFT;
+	sb->s_magic		= NSFD_FS_MAGIC;
+	sb->s_op		= &nsfd_super_operations;
+	sb->s_time_gran		= 1;
+
+	/*
+	 * nsfd_mkinode() reports failure with ERR_PTR(), never NULL, so it
+	 * has to be tested with IS_ERR().  There is no inode to iput() in
+	 * that case.
+	 */
+	inode = nsfd_mkinode(sb);
+	if (IS_ERR(inode))
+		return PTR_ERR(inode);
+
+	sb->s_root = nsfd_alloc_dentry(inode);
+	if (!sb->s_root) {
+		/* The dentry never took over our reference; drop it. */
+		iput(inode);
+		return -ENOMEM;
+	}
+
+	/* Save the inode for later.. */
+	nsfd_inode = inode;
+
+	return 0;
+}
+
+/*
+ * get_sb callback for the "nsfd" filesystem type.  @flags, @dev_name and
+ * @data are not used: the superblock is always obtained through
+ * get_sb_single() with nsfd_fill_super() as the fill callback.
+ */
+static int nsfd_get_sb(struct file_system_type *fs_type, int flags,
+	const char *dev_name, void *data, struct vfsmount *mnt)
+{
+	/* We can't use get_sb_pseudo because that sets MS_NOUSER */
+	return get_sb_single(fs_type, 0, NULL, nsfd_fill_super, mnt);
+}
+
+
+/*
+ * The "nsfd" filesystem type; registered and kern_mount()ed once by
+ * nsfd_init().
+ */
+static struct file_system_type nsfd_fs_type = {
+	.name		= "nsfd",
+	.get_sb		= nsfd_get_sb,
+	.kill_sb	= kill_anon_super,
+	
+};
+
+/*
+ * d_release callback for network-namespace dentries: drop the struct net
+ * reference stashed in d_fsdata, then clear the pointer.
+ */
+static void netns_dentry_release(struct dentry *dentry)
+{
+	struct net *net = dentry->d_fsdata;
+
+	put_net(net);
+	dentry->d_fsdata = NULL;
+}
+
+/*
+ * Dentry operations for an nsfd dentry that holds a network namespace in
+ * d_fsdata: same name as any nsfd dentry, plus a d_release that drops the
+ * namespace reference.
+ */
+static const struct dentry_operations netns_dentry_operations = {
+	.d_dname	= nsfd_dname,
+	.d_release	= netns_dentry_release,
+};
+
+/*
+ * Per-namespace-type dentry operations, indexed by NSTYPE_* value.  The
+ * d_op pointer doubles as the type tag that nsfd_fget() checks.
+ */
+static const struct dentry_operations *nsfd_dops[] = {
+	[NSTYPE_NET] = &netns_dentry_operations,
+};
+
+/*
+ * Look up the dentry operations registered for @nstype in nsfd_dops[].
+ * Returns NULL when @nstype lies beyond the table or its slot is empty.
+ */
+static const struct dentry_operations *nstype_dops(unsigned long nstype)
+{
+	if (nstype >= ARRAY_SIZE(nsfd_dops))
+		return NULL;
+
+	return nsfd_dops[nstype];
+}
+
+/*
+ * Resolve @fd to an nsfd file that refers to a namespace of type @nstype.
+ *
+ * Returns the file with the reference obtained from fget() (the caller
+ * must fput() it), or:
+ *   ERR_PTR(-EINVAL) - @nstype is unknown, or @fd is not an nsfd file of
+ *                      that type
+ *   ERR_PTR(-EBADF)  - @fd is not an open file descriptor
+ */
+static struct file *nsfd_fget(int fd, unsigned long nstype)
+{
+	const struct dentry_operations *wanted = nstype_dops(nstype);
+	struct file *filp;
+
+	if (!wanted)
+		return ERR_PTR(-EINVAL);
+
+	filp = fget(fd);
+	if (!filp)
+		return ERR_PTR(-EBADF);
+
+	/* Accept only nsfd files whose dentry carries this nstype's ops. */
+	if (filp->f_op == &nsfd_file_operations &&
+	    filp->f_path.dentry->d_op == wanted)
+		return filp;
+
+	fput(filp);
+	return ERR_PTR(-EINVAL);
+}
+
+
+/*
+ * Build a read-only struct file on a freshly allocated nsfd dentry.
+ *
+ * The dentry is created on nsfd_inode, but the vfsmount is borrowed from
+ * the caller's root (current->fs->root.mnt), not from the nsfd mount; see
+ * the HACK note below.
+ *
+ * Returns the new file, ERR_PTR(-ENOMEM) if the dentry cannot be
+ * allocated, or ERR_PTR(-ENFILE) if alloc_file() fails.
+ */
+static struct file *nsfd_getfile(void)
+{
+	struct path path;
+	struct file *file;
+
+	path.dentry = nsfd_alloc_dentry(nsfd_inode);
+	if (!path.dentry)
+		return ERR_PTR(-ENOMEM);
+
+	/* HACK I need a vfsmnt with mnt_ns == current_nsproxy_mnt_ns
+	 * and (mnt_sb->s_flags & MS_NOUSER) == 0.  The only way I can
+	 * get such a vfsmount without having an instance of my filesystem
+	 * mounted in the namespace is to steal one.
+	 */
+	path.mnt = mntget(current->fs->root.mnt);
+
+	file = alloc_file(&path, FMODE_READ, &nsfd_file_operations);
+	if (!file) {
+		/* Drops both the dentry and the borrowed vfsmount reference. */
+		path_put(&path);
+		return ERR_PTR(-ENFILE);
+	}
+	file->f_mapping = nsfd_inode->i_mapping;
+
+	file->f_pos = 0;
+	file->f_flags = O_RDONLY;
+	file->f_version = 0;
+	file->private_data = NULL;
+
+	return file;
+}
+
+/*
+ * Look up the namespace of type @nstype that task @pid is using.
+ * @pid: pid as seen by the caller (resolved with find_task_by_vpid());
+ *       0 means the calling task itself
+ * @nstype: NSTYPE_* selector
+ *
+ * For NSTYPE_NET the result is the value returned by get_net() on the
+ * task's net_ns.  Otherwise an error pointer is returned:
+ *   ERR_PTR(-ESRCH)  - no such task, or the task has no nsproxy
+ *   ERR_PTR(-EPERM)  - ptrace_may_access(PTRACE_MODE_ATTACH) refused
+ *   ERR_PTR(-EINVAL) - @nstype is not handled by the switch below
+ *
+ * The whole lookup runs inside an rcu_read_lock() section.
+ */
+static void *nsfd_getns(pid_t pid, unsigned long nstype)
+{
+	struct task_struct *task;
+	struct nsproxy *nsproxy;
+	void *ns;
+
+	/* ns always holds the error for the next check that might fail. */
+	ns = ERR_PTR(-ESRCH);
+	rcu_read_lock();
+	if (pid == 0)
+		task = current;
+	else
+		task = find_task_by_vpid(pid);
+	if (!task)
+		goto out;
+
+	ns = ERR_PTR(-EPERM);
+	if (!ptrace_may_access(task, PTRACE_MODE_ATTACH))
+		goto out;
+
+	ns = ERR_PTR(-ESRCH);
+	nsproxy = task_nsproxy(task);
+	if (!nsproxy)
+		goto out;
+
+	/* Unhandled nstype values fall through the switch with -EINVAL. */
+	ns = ERR_PTR(-EINVAL);
+	switch(nstype) {
+	case NSTYPE_NET:
+		ns = get_net(nsproxy->net_ns);
+		break;
+	}
+out:
+	rcu_read_unlock();
+	return ns;
+}
+
+/*
+ * nsfd(pid, nstype) - return a file descriptor referring to the namespace
+ * of type @nstype used by task @pid (0 selects the caller; see
+ * nsfd_getns()).
+ *
+ * Returns the new descriptor, or a negative errno: -EINVAL for an unknown
+ * @nstype, the error from nsfd_getfile() or nsfd_getns(), or the error
+ * from get_unused_fd().
+ */
+SYSCALL_DEFINE2(nsfd, pid_t, pid, unsigned long, nstype)
+{
+	const struct dentry_operations *d_op;
+	struct file *file;
+	int fd;
+	void *ns;
+
+	d_op = nstype_dops(nstype);
+	if (!d_op)
+		return -EINVAL;
+
+	file = nsfd_getfile();
+	if (IS_ERR(file))
+		return PTR_ERR(file);
+
+	ns = nsfd_getns(pid, nstype);
+	if (IS_ERR(ns)) {
+		/*
+		 * The dentry still has the default nsfd operations here,
+		 * which have no d_release, so no namespace is put.
+		 */
+		fput(file);
+		return PTR_ERR(ns);
+	}
+
+	/*
+	 * Hand the namespace reference to the dentry.  The type-specific
+	 * d_op both tags the dentry for nsfd_fget() and supplies the
+	 * d_release that drops the reference.
+	 */
+	file->f_dentry->d_fsdata = ns;
+	file->f_dentry->d_op = d_op;
+	
+	fd = get_unused_fd();
+	if (fd < 0) {
+		fput(file);
+		return fd;
+	}
+	fd_install(fd, file);
+
+	return fd;
+}
+
+
+/*
+ * setns(nstype, fd) - switch the calling task into the namespace that the
+ * nsfd file descriptor @fd refers to.
+ *
+ * Requires CAP_SYS_ADMIN (-EPERM otherwise).  Fails with the error from
+ * nsfd_fget() when @fd is not an nsfd descriptor of type @nstype.
+ * Returns 0 once set_namespace() has been called; set_namespace() itself
+ * returns void, so nothing it does can be reported from here.
+ */
+SYSCALL_DEFINE2(setns, unsigned long, nstype, int, fd)
+{
+	struct file *nsfile;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	nsfile = nsfd_fget(fd, nstype);
+	if (IS_ERR(nsfile))
+		return PTR_ERR(nsfile);
+
+	/* The namespace object lives in the dentry's d_fsdata. */
+	set_namespace(nstype, nsfile->f_path.dentry->d_fsdata);
+	fput(nsfile);
+
+	return 0;
+}
+
+
+/*
+ * Boot-time setup: register the "nsfd" filesystem type and create its
+ * internal mount (kept in nsfd_mnt).  Any failure ends in panic(), after
+ * unregistering the filesystem type if it had been registered.
+ */
+static int __init nsfd_init(void)
+{
+	int error = register_filesystem(&nsfd_fs_type);
+
+	if (!error) {
+		nsfd_mnt = kern_mount(&nsfd_fs_type);
+		if (!IS_ERR(nsfd_mnt))
+			return 0;
+
+		error = PTR_ERR(nsfd_mnt);
+		unregister_filesystem(&nsfd_fs_type);
+	}
+
+	panic(KERN_ERR "nsfd_init() failed (%d)\n", error);
+}
+
+fs_initcall(nsfd_init);
diff --git a/include/linux/magic.h b/include/linux/magic.h
index 76285e0..a4fe6eb 100644
--- a/include/linux/magic.h
+++ b/include/linux/magic.h
@@ -26,6 +26,7 @@
 #define ISOFS_SUPER_MAGIC	0x9660
 #define JFFS2_SUPER_MAGIC	0x72b6
 #define ANON_INODE_FS_MAGIC	0x09041934
+#define NSFD_FS_MAGIC		0x6e736664
 
 #define MINIX_SUPER_MAGIC	0x137F		/* original minix fs */
 #define MINIX_SUPER_MAGIC2	0x138F		/* minix fs, 30 char names */
diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
index 7b370c7..45f1e07 100644
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -65,6 +65,7 @@ static inline struct nsproxy *task_nsproxy(struct task_struct *tsk)
 int copy_namespaces(unsigned long flags, struct task_struct *tsk);
 void exit_task_namespaces(struct task_struct *tsk);
 void switch_task_namespaces(struct task_struct *tsk, struct nsproxy *new);
+void set_namespace(unsigned long nstype, void *ns);
 void free_nsproxy(struct nsproxy *ns);
 int unshare_nsproxy_namespaces(unsigned long, struct nsproxy **,
 	struct fs_struct *);
diff --git a/include/linux/nstype.h b/include/linux/nstype.h
new file mode 100644
index 0000000..3bdf856
--- /dev/null
+++ b/include/linux/nstype.h
@@ -0,0 +1,6 @@
+#ifndef _LINUX_NSTYPE_H
+#define _LINUX_NSTYPE_H
+
+/*
+ * Namespace type selectors, passed as the nstype argument of the nsfd()
+ * and setns() system calls.
+ */
+
+/* Network namespace (nsproxy->net_ns). */
+#define NSTYPE_NET 0
+
+#endif /* _LINUX_NSTYPE_H */
diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c
index 09b4ff9..574461c 100644
--- a/kernel/nsproxy.c
+++ b/kernel/nsproxy.c
@@ -21,6 +21,7 @@
 #include <linux/pid_namespace.h>
 #include <net/net_namespace.h>
 #include <linux/ipc_namespace.h>
+#include <linux/nstype.h>
 
 static struct kmem_cache *nsproxy_cachep;
 
@@ -221,6 +222,22 @@ void exit_task_namespaces(struct task_struct *p)
 	switch_task_namespaces(p, NULL);
 }
 
+/*
+ * set_namespace - move the current task into the namespace @ns.
+ * @nstype: NSTYPE_* selector saying what kind of namespace @ns is
+ * @ns: the namespace object; a struct net * for NSTYPE_NET
+ *
+ * A new nsproxy is obtained for current from create_new_namespaces().
+ * The namespace selected by @nstype is then replaced in it by @ns: the
+ * reference on the old one is dropped and a new reference on @ns is
+ * taken.  Finally the task is switched to the new nsproxy.  An @nstype
+ * that is not recognised leaves the new nsproxy unmodified.
+ *
+ * If the new nsproxy cannot be created, the task's namespaces are left
+ * untouched.  NOTE(review): create_new_namespaces() is defined outside
+ * this hunk and is assumed to report failure with ERR_PTR() - confirm.
+ * NOTE(review): the void return type gives callers such as setns() no way
+ * to see that failure; consider returning an errno instead.
+ */
+void set_namespace(unsigned long nstype, void *ns)
+{
+	struct task_struct *tsk = current;
+	struct nsproxy *new_nsproxy;
+
+	new_nsproxy = create_new_namespaces(0, tsk, tsk->fs);
+	/* Never dereference an error pointer. */
+	if (IS_ERR(new_nsproxy))
+		return;
+
+	switch (nstype) {
+	case NSTYPE_NET:
+		put_net(new_nsproxy->net_ns);
+		new_nsproxy->net_ns = get_net(ns);
+		break;
+	default:
+		/* Unknown type: install the unmodified copy. */
+		break;
+	}
+
+	switch_task_namespaces(tsk, new_nsproxy);
+}
+
 static int __init nsproxy_cache_init(void)
 {
 	nsproxy_cachep = KMEM_CACHE(nsproxy, SLAB_PANIC);
-- 
1.6.5.2.143.g8cc62

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html