Re: [RFC v5 2/2] seccomp: replace existing file in the epoll interface by a new file injected by the syscall supervisor.

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Sat, Apr 29, 2023 at 05:49:55AM +0000, aloktiagi wrote:
> Introduce a mechanism to replace a file linked in the epoll interface by a new
> file injected by the syscall supervisor by using the epoll provided
> eventpoll_replace_file() api.
> 
> Also introduce a new addfd flag SECCOMP_ADDFD_FLAG_REPLACE_REF to allow the supervisor
> to indicate that it is interested in getting the original file replaced by the
> new injected file.
> 
> We have a use case where multiple IPv6 only network namespaces can use a single
> IPv4 network namespace for IPv4 only egress connectivity by switching their
> sockets from IPv6 to IPv4 network namespace. This allows for migration of
> systems to IPv6 only while keeping their connectivity to IPv4 only destinations
> intact.
> 
> Today, we achieve this by setting up seccomp filter to intercept network system
> calls like connect() from a container in a syscall supervisor which runs in an
> IPv4 only network namespace. The syscall supervisor creates a new IPv4 connection
> and injects the new file descriptor through SECCOMP_NOTIFY_IOCTL_ADDFD replacing
> the original file descriptor from the connect() call. This does not work for
> cases where the original file descriptor is handed off to a system like epoll
> before the connect() call. After a new file descriptor is injected the original
> file descriptor being referenced by the epoll fd is not longer valid leading to
> failures. As a workaround the syscall supervisor when intercepting connect()
> loops through all open socket file descriptors to check if they are referencing
> the socket attempting the connect() and replace the reference with the to be
> injected file descriptor. This workaround is cumbersome and makes the solution
> prone to similar yet to be discovered issues.
> 
> The above change will enable us remove the workaround in the syscall supervisor
> and let the kernel handle the replacement correctly.
> 
> Signed-off-by: aloktiagi <aloktiagi@xxxxxxxxx>
> ---
>  include/uapi/linux/seccomp.h                  |   1 +
>  kernel/seccomp.c                              |  35 +++++-
>  tools/testing/selftests/seccomp/seccomp_bpf.c | 102 ++++++++++++++++++
>  3 files changed, 136 insertions(+), 2 deletions(-)
> 
> diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h
> index 0fdc6ef02b94..0a74dc5d967f 100644
> --- a/include/uapi/linux/seccomp.h
> +++ b/include/uapi/linux/seccomp.h
> @@ -118,6 +118,7 @@ struct seccomp_notif_resp {
>  /* valid flags for seccomp_notif_addfd */
>  #define SECCOMP_ADDFD_FLAG_SETFD	(1UL << 0) /* Specify remote fd */
>  #define SECCOMP_ADDFD_FLAG_SEND		(1UL << 1) /* Addfd and return it, atomically */
> +#define SECCOMP_ADDFD_FLAG_REPLACE_REF	(1UL << 2) /* Update replace references */
>  
>  /**
>   * struct seccomp_notif_addfd
> diff --git a/kernel/seccomp.c b/kernel/seccomp.c
> index cebf26445f9e..5b1b265b30d9 100644
> --- a/kernel/seccomp.c
> +++ b/kernel/seccomp.c
> @@ -19,6 +19,7 @@
>  #include <linux/audit.h>
>  #include <linux/compat.h>
>  #include <linux/coredump.h>
> +#include <linux/eventpoll.h>
>  #include <linux/kmemleak.h>
>  #include <linux/nospec.h>
>  #include <linux/prctl.h>
> @@ -1056,6 +1057,7 @@ static u64 seccomp_next_notify_id(struct seccomp_filter *filter)
>  static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd, struct seccomp_knotif *n)
>  {
>  	int fd;
> +	struct file *old_file = NULL;
>  
>  	/*
>  	 * Remove the notification, and reset the list pointers, indicating
> @@ -1064,8 +1066,30 @@ static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd, struct seccomp_kn
>  	list_del_init(&addfd->list);
>  	if (!addfd->setfd)
>  		fd = receive_fd(addfd->file, addfd->flags);
> -	else
> +	else {
> +		int ret = 0;
> +		if (addfd->ioctl_flags & SECCOMP_ADDFD_FLAG_REPLACE_REF) {
> +			old_file = fget(addfd->fd);
> +			if (!old_file) {
> +				fd = -EBADF;
> +				goto error;
> +			}
> +			ret = eventpoll_replace_file(old_file, addfd->file, addfd->fd);
> +			if (ret < 0) {
> +				fd = ret;
> +				goto error;
> +			}
> +		}
>  		fd = receive_fd_replace(addfd->fd, addfd->file, addfd->flags);
> +		/* In case of error restore all references */
> +		if (fd < 0 && addfd->ioctl_flags & SECCOMP_ADDFD_FLAG_REPLACE_REF) {
> +			ret = eventpoll_replace_file(addfd->file, old_file, addfd->fd);
> +			if (ret < 0) {
> +				fd = ret;
> +			}
> +		}
> +	}
> +error:
>  	addfd->ret = fd;
>  
>  	if (addfd->ioctl_flags & SECCOMP_ADDFD_FLAG_SEND) {
> @@ -1080,6 +1104,9 @@ static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd, struct seccomp_kn
>  		}
>  	}
>  
> +	if (old_file)
> +		fput(old_file);
> +
>  	/*
>  	 * Mark the notification as completed. From this point, addfd mem
>  	 * might be invalidated and we can't safely read it anymore.
> @@ -1613,12 +1640,16 @@ static long seccomp_notify_addfd(struct seccomp_filter *filter,
>  	if (addfd.newfd_flags & ~O_CLOEXEC)
>  		return -EINVAL;
>  
> -	if (addfd.flags & ~(SECCOMP_ADDFD_FLAG_SETFD | SECCOMP_ADDFD_FLAG_SEND))
> +	if (addfd.flags & ~(SECCOMP_ADDFD_FLAG_SETFD | SECCOMP_ADDFD_FLAG_SEND |
> +			    SECCOMP_ADDFD_FLAG_REPLACE_REF))
>  		return -EINVAL;
>  
>  	if (addfd.newfd && !(addfd.flags & SECCOMP_ADDFD_FLAG_SETFD))
>  		return -EINVAL;
>  
> +	if (!addfd.newfd && (addfd.flags & SECCOMP_ADDFD_FLAG_REPLACE_REF))
> +		return -EINVAL;
> +
>  	kaddfd.file = fget(addfd.srcfd);
>  	if (!kaddfd.file)
>  		return -EBADF;
> diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c
> index 61386e499b77..3ece9407c6a9 100644
> --- a/tools/testing/selftests/seccomp/seccomp_bpf.c
> +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c
> @@ -47,6 +47,7 @@
>  #include <linux/kcmp.h>
>  #include <sys/resource.h>
>  #include <sys/capability.h>
> +#include <sys/epoll.h>
>  
>  #include <unistd.h>
>  #include <sys/syscall.h>
> @@ -4179,6 +4180,107 @@ TEST(user_notification_addfd)
>  	close(memfd);
>  }
>  
> +TEST(user_notification_addfd_with_epoll_replace)
> +{
> +	char c;
> +	pid_t pid;
> +	long ret;
> +	int optval;
> +	socklen_t optlen = sizeof(optval);
> +	int status, listener, fd;
> +	int efd, sfd[4];
> +	struct epoll_event e;
> +	struct seccomp_notif_addfd addfd = {};
> +	struct seccomp_notif req = {};
> +	struct seccomp_notif_resp resp = {};
> +
> +	ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
> +	ASSERT_EQ(0, ret) {
> +		TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!");
> +	}
> +
> +	listener = user_notif_syscall(__NR_getsockopt,
> +				      SECCOMP_FILTER_FLAG_NEW_LISTENER);
> +
> +	/* Create two socket pairs sfd[0] <-> sfd[1] and sfd[2] <-> sfd[3] */
> +	ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &sfd[2]), 0);
> +
> +	pid = fork();
> +	ASSERT_GE(pid, 0);
> +
> +	if (pid == 0) {
> +		if (socketpair(AF_UNIX, SOCK_STREAM, 0, &sfd[0]) != 0)
> +			exit(1);
> +
> +		efd = epoll_create(1);
> +		if (efd == -1)
> +			exit(1);
> +
> +		e.events = EPOLLIN;
> +		if (epoll_ctl(efd, EPOLL_CTL_ADD, sfd[0], &e) != 0)
> +			exit(1);
> +
> +		/*
> +		 * fd will be added here to replace an existing one linked
> +		 * in the epoll interface.
> +		 */
> +		if (getsockopt(sfd[0], SOL_SOCKET, SO_DOMAIN, &optval,
> +		       &optlen) != USER_NOTIF_MAGIC)
> +			exit(1);
> +
> +		/*
> +		 * Write data to the sfd[3] connected to sfd[2], but due to
> +		 * the swap, we should see data on sfd[0]
> +		 */
> +		if (write(sfd[3], "w", 1) != 1)
> +			exit(1);
> +
> +		if (epoll_wait(efd, &e, 1, 0) != 1)
> +			exit(1);
> +
> +		if (read(sfd[0], &c, 1) != 1)
> +			exit(1);
> +
> +		if ('w' != c)
> +			exit(1);
> +
> +		if (epoll_ctl(efd, EPOLL_CTL_DEL, sfd[0], &e) != 0)
> +			exit(1);
> +
> +		close(efd);
> +		close(sfd[0]);
> +		close(sfd[1]);
> +		close(sfd[2]);
> +		close(sfd[3]);
> +		exit(0);
> +	}
> +
> +	ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0);
> +
> +	addfd.srcfd = sfd[2];
> +	addfd.newfd = req.data.args[0];
> +	addfd.id = req.id;
> +	addfd.flags = SECCOMP_ADDFD_FLAG_SETFD | SECCOMP_ADDFD_FLAG_REPLACE_REF;
> +	addfd.newfd_flags = O_CLOEXEC;
> +
> +	/*
> +	 * Verfiy we can install and replace a file that is linked in the
> +	 * epoll interface. Replace the socket sfd[0] with sfd[2]
> +	 */
> +	fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd);
> +	EXPECT_EQ(fd, req.data.args[0]);
> +
> +	resp.id = req.id;
> +	resp.error = 0;
> +	resp.val = USER_NOTIF_MAGIC;
> +	EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0);
> +
> +	/* Wait for child to finish. */
> +	EXPECT_EQ(waitpid(pid, &status, 0), pid);
> +	EXPECT_EQ(true, WIFEXITED(status));
> +	EXPECT_EQ(0, WEXITSTATUS(status));
> +}
> +
>  TEST(user_notification_addfd_rlimit)
>  {
>  	pid_t pid;
> -- 
> 2.34.1
> 

thoughts on this?

- Alok 



[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [NTFS 3]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [NTFS 3]     [Samba]     [Device Mapper]     [CEPH Development]

  Powered by Linux