On Sat, Apr 29, 2023 at 05:49:55AM +0000, aloktiagi wrote: > Introduce a mechanism to replace a file linked in the epoll interface by a new > file injected by the syscall supervisor by using the epoll provided > eventpoll_replace_file() api. > > Also introduce a new addfd flag SECCOMP_ADDFD_FLAG_REPLACE_REF to allow the supervisor > to indicate that it is interested in getting the original file replaced by the > new injected file. > > We have a use case where multiple IPv6 only network namespaces can use a single > IPv4 network namespace for IPv4 only egress connectivity by switching their > sockets from IPv6 to IPv4 network namespace. This allows for migration of > systems to IPv6 only while keeping their connectivity to IPv4 only destinations > intact. > > Today, we achieve this by setting up seccomp filter to intercept network system > calls like connect() from a container in a syscall supervisor which runs in an > IPv4 only network namespace. The syscall supervisor creates a new IPv4 connection > and injects the new file descriptor through SECCOMP_NOTIFY_IOCTL_ADDFD replacing > the original file descriptor from the connect() call. This does not work for > cases where the original file descriptor is handed off to a system like epoll > before the connect() call. After a new file descriptor is injected the original > file descriptor being referenced by the epoll fd is not longer valid leading to > failures. As a workaround the syscall supervisor when intercepting connect() > loops through all open socket file descriptors to check if they are referencing > the socket attempting the connect() and replace the reference with the to be > injected file descriptor. This workaround is cumbersome and makes the solution > prone to similar yet to be discovered issues. > > The above change will enable us remove the workaround in the syscall supervisor > and let the kernel handle the replacement correctly. > > Signed-off-by: aloktiagi <aloktiagi@xxxxxxxxx> > --- > include/uapi/linux/seccomp.h | 1 + > kernel/seccomp.c | 35 +++++- > tools/testing/selftests/seccomp/seccomp_bpf.c | 102 ++++++++++++++++++ > 3 files changed, 136 insertions(+), 2 deletions(-) > > diff --git a/include/uapi/linux/seccomp.h b/include/uapi/linux/seccomp.h > index 0fdc6ef02b94..0a74dc5d967f 100644 > --- a/include/uapi/linux/seccomp.h > +++ b/include/uapi/linux/seccomp.h > @@ -118,6 +118,7 @@ struct seccomp_notif_resp { > /* valid flags for seccomp_notif_addfd */ > #define SECCOMP_ADDFD_FLAG_SETFD (1UL << 0) /* Specify remote fd */ > #define SECCOMP_ADDFD_FLAG_SEND (1UL << 1) /* Addfd and return it, atomically */ > +#define SECCOMP_ADDFD_FLAG_REPLACE_REF (1UL << 2) /* Update replace references */ > > /** > * struct seccomp_notif_addfd > diff --git a/kernel/seccomp.c b/kernel/seccomp.c > index cebf26445f9e..5b1b265b30d9 100644 > --- a/kernel/seccomp.c > +++ b/kernel/seccomp.c > @@ -19,6 +19,7 @@ > #include <linux/audit.h> > #include <linux/compat.h> > #include <linux/coredump.h> > +#include <linux/eventpoll.h> > #include <linux/kmemleak.h> > #include <linux/nospec.h> > #include <linux/prctl.h> > @@ -1056,6 +1057,7 @@ static u64 seccomp_next_notify_id(struct seccomp_filter *filter) > static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd, struct seccomp_knotif *n) > { > int fd; > + struct file *old_file = NULL; > > /* > * Remove the notification, and reset the list pointers, indicating > @@ -1064,8 +1066,30 @@ static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd, struct seccomp_kn > list_del_init(&addfd->list); > if (!addfd->setfd) > fd = receive_fd(addfd->file, addfd->flags); > - else > + else { > + int ret = 0; > + if (addfd->ioctl_flags & SECCOMP_ADDFD_FLAG_REPLACE_REF) { > + old_file = fget(addfd->fd); > + if (!old_file) { > + fd = -EBADF; > + goto error; > + } > + ret = eventpoll_replace_file(old_file, addfd->file, addfd->fd); > + if (ret < 0) { > + fd = ret; > + goto error; > + } > + } > fd = receive_fd_replace(addfd->fd, addfd->file, addfd->flags); > + /* In case of error restore all references */ > + if (fd < 0 && addfd->ioctl_flags & SECCOMP_ADDFD_FLAG_REPLACE_REF) { > + ret = eventpoll_replace_file(addfd->file, old_file, addfd->fd); > + if (ret < 0) { > + fd = ret; > + } > + } > + } > +error: > addfd->ret = fd; > > if (addfd->ioctl_flags & SECCOMP_ADDFD_FLAG_SEND) { > @@ -1080,6 +1104,9 @@ static void seccomp_handle_addfd(struct seccomp_kaddfd *addfd, struct seccomp_kn > } > } > > + if (old_file) > + fput(old_file); > + > /* > * Mark the notification as completed. From this point, addfd mem > * might be invalidated and we can't safely read it anymore. > @@ -1613,12 +1640,16 @@ static long seccomp_notify_addfd(struct seccomp_filter *filter, > if (addfd.newfd_flags & ~O_CLOEXEC) > return -EINVAL; > > - if (addfd.flags & ~(SECCOMP_ADDFD_FLAG_SETFD | SECCOMP_ADDFD_FLAG_SEND)) > + if (addfd.flags & ~(SECCOMP_ADDFD_FLAG_SETFD | SECCOMP_ADDFD_FLAG_SEND | > + SECCOMP_ADDFD_FLAG_REPLACE_REF)) > return -EINVAL; > > if (addfd.newfd && !(addfd.flags & SECCOMP_ADDFD_FLAG_SETFD)) > return -EINVAL; > > + if (!addfd.newfd && (addfd.flags & SECCOMP_ADDFD_FLAG_REPLACE_REF)) > + return -EINVAL; > + > kaddfd.file = fget(addfd.srcfd); > if (!kaddfd.file) > return -EBADF; > diff --git a/tools/testing/selftests/seccomp/seccomp_bpf.c b/tools/testing/selftests/seccomp/seccomp_bpf.c > index 61386e499b77..3ece9407c6a9 100644 > --- a/tools/testing/selftests/seccomp/seccomp_bpf.c > +++ b/tools/testing/selftests/seccomp/seccomp_bpf.c > @@ -47,6 +47,7 @@ > #include <linux/kcmp.h> > #include <sys/resource.h> > #include <sys/capability.h> > +#include <sys/epoll.h> > > #include <unistd.h> > #include <sys/syscall.h> > @@ -4179,6 +4180,107 @@ TEST(user_notification_addfd) > close(memfd); > } > > +TEST(user_notification_addfd_with_epoll_replace) > +{ > + char c; > + pid_t pid; > + long ret; > + int optval; > + socklen_t optlen = sizeof(optval); > + int status, listener, fd; > + int efd, sfd[4]; > + struct epoll_event e; > + struct seccomp_notif_addfd addfd = {}; > + struct seccomp_notif req = {}; > + struct seccomp_notif_resp resp = {}; > + > + ret = prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); > + ASSERT_EQ(0, ret) { > + TH_LOG("Kernel does not support PR_SET_NO_NEW_PRIVS!"); > + } > + > + listener = user_notif_syscall(__NR_getsockopt, > + SECCOMP_FILTER_FLAG_NEW_LISTENER); > + > + /* Create two socket pairs sfd[0] <-> sfd[1] and sfd[2] <-> sfd[3] */ > + ASSERT_EQ(socketpair(AF_UNIX, SOCK_STREAM, 0, &sfd[2]), 0); > + > + pid = fork(); > + ASSERT_GE(pid, 0); > + > + if (pid == 0) { > + if (socketpair(AF_UNIX, SOCK_STREAM, 0, &sfd[0]) != 0) > + exit(1); > + > + efd = epoll_create(1); > + if (efd == -1) > + exit(1); > + > + e.events = EPOLLIN; > + if (epoll_ctl(efd, EPOLL_CTL_ADD, sfd[0], &e) != 0) > + exit(1); > + > + /* > + * fd will be added here to replace an existing one linked > + * in the epoll interface. > + */ > + if (getsockopt(sfd[0], SOL_SOCKET, SO_DOMAIN, &optval, > + &optlen) != USER_NOTIF_MAGIC) > + exit(1); > + > + /* > + * Write data to the sfd[3] connected to sfd[2], but due to > + * the swap, we should see data on sfd[0] > + */ > + if (write(sfd[3], "w", 1) != 1) > + exit(1); > + > + if (epoll_wait(efd, &e, 1, 0) != 1) > + exit(1); > + > + if (read(sfd[0], &c, 1) != 1) > + exit(1); > + > + if ('w' != c) > + exit(1); > + > + if (epoll_ctl(efd, EPOLL_CTL_DEL, sfd[0], &e) != 0) > + exit(1); > + > + close(efd); > + close(sfd[0]); > + close(sfd[1]); > + close(sfd[2]); > + close(sfd[3]); > + exit(0); > + } > + > + ASSERT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_RECV, &req), 0); > + > + addfd.srcfd = sfd[2]; > + addfd.newfd = req.data.args[0]; > + addfd.id = req.id; > + addfd.flags = SECCOMP_ADDFD_FLAG_SETFD | SECCOMP_ADDFD_FLAG_REPLACE_REF; > + addfd.newfd_flags = O_CLOEXEC; > + > + /* > + * Verfiy we can install and replace a file that is linked in the > + * epoll interface. Replace the socket sfd[0] with sfd[2] > + */ > + fd = ioctl(listener, SECCOMP_IOCTL_NOTIF_ADDFD, &addfd); > + EXPECT_EQ(fd, req.data.args[0]); > + > + resp.id = req.id; > + resp.error = 0; > + resp.val = USER_NOTIF_MAGIC; > + EXPECT_EQ(ioctl(listener, SECCOMP_IOCTL_NOTIF_SEND, &resp), 0); > + > + /* Wait for child to finish. */ > + EXPECT_EQ(waitpid(pid, &status, 0), pid); > + EXPECT_EQ(true, WIFEXITED(status)); > + EXPECT_EQ(0, WEXITSTATUS(status)); > +} > + > TEST(user_notification_addfd_rlimit) > { > pid_t pid; > -- > 2.34.1 > thoughts on this? - Alok