On Tue, Jun 14, 2022 at 12:09 PM Peter Xu <peterx@xxxxxxxxxx> wrote: > > On Wed, Jun 01, 2022 at 02:09:47PM -0700, Axel Rasmussen wrote: > > Historically, it has been shown that intercepting kernel faults with > > userfaultfd (thereby forcing the kernel to wait for an arbitrary amount > > of time) can be exploited, or at least can make some kinds of exploits > > easier. So, in 37cd0575b8 "userfaultfd: add UFFD_USER_MODE_ONLY" we > > changed things so, in order for kernel faults to be handled by > > userfaultfd, either the process needs CAP_SYS_PTRACE, or this sysctl > > must be configured so that any unprivileged user can do it. > > > > In a typical implementation of a hypervisor with live migration (take > > QEMU/KVM as one such example), we do indeed need to be able to handle > > kernel faults. But, both options above are less than ideal: > > > > - Toggling the sysctl increases attack surface by allowing any > > unprivileged user to do it. > > > > - Granting the live migration process CAP_SYS_PTRACE gives it this > > ability, but *also* the ability to "observe and control the > > execution of another process [...], and examine and change [its] > > memory and registers" (from ptrace(2)). This isn't something we need > > or want to be able to do, so granting this permission violates the > > "principle of least privilege". > > > > This is all a long-winded way to say: we want a more fine-grained way to > > grant access to userfaultfd, without granting other additional > > permissions at the same time. > > > > To achieve this, add a /dev/userfaultfd misc device. This device > > provides an alternative to the userfaultfd(2) syscall for the creation > > of new userfaultfds. The idea is, any userfaultfds created this way will > > be able to handle kernel faults, without the caller having any special > > capabilities. Access to this mechanism is instead restricted using e.g. > > standard filesystem permissions.
> > > > Signed-off-by: Axel Rasmussen <axelrasmussen@xxxxxxxxxx> > > --- > > fs/userfaultfd.c | 76 ++++++++++++++++++++++++++------ > > include/uapi/linux/userfaultfd.h | 4 ++ > > 2 files changed, 66 insertions(+), 14 deletions(-) > > > > diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c > > index e943370107d0..8b92c1398169 100644 > > --- a/fs/userfaultfd.c > > +++ b/fs/userfaultfd.c > > @@ -30,6 +30,7 @@ > > #include <linux/security.h> > > #include <linux/hugetlb.h> > > #include <linux/swapops.h> > > +#include <linux/miscdevice.h> > > > > int sysctl_unprivileged_userfaultfd __read_mostly; > > > > @@ -413,13 +414,8 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) > > > > if (ctx->features & UFFD_FEATURE_SIGBUS) > > goto out; > > - if ((vmf->flags & FAULT_FLAG_USER) == 0 && > > - ctx->flags & UFFD_USER_MODE_ONLY) { > > - printk_once(KERN_WARNING "uffd: Set unprivileged_userfaultfd " > > - "sysctl knob to 1 if kernel faults must be handled " > > - "without obtaining CAP_SYS_PTRACE capability\n"); > > + if (!(vmf->flags & FAULT_FLAG_USER) && (ctx->flags & UFFD_USER_MODE_ONLY)) > > goto out; > > - } > > > > /* > > * If it's already released don't get it. This avoids to loop > > @@ -2052,19 +2048,33 @@ static void init_once_userfaultfd_ctx(void *mem) > > seqcount_spinlock_init(&ctx->refile_seq, &ctx->fault_pending_wqh.lock); > > } > > > > -SYSCALL_DEFINE1(userfaultfd, int, flags) > > +static inline bool userfaultfd_allowed(bool is_syscall, int flags) > > +{ > > + bool kernel_faults = !(flags & UFFD_USER_MODE_ONLY); > > + bool allow_unprivileged = sysctl_unprivileged_userfaultfd; > > + > > + /* userfaultfd(2) access is controlled by sysctl + capability. */ > > + if (is_syscall && kernel_faults) { > > + if (!allow_unprivileged && !capable(CAP_SYS_PTRACE)) > > + return false; > > + } > > + > > + /* > > + * For /dev/userfaultfd, access is to be controlled using e.g. > > + * permissions on the device node. 
We assume this is correctly > > + * configured by userspace, so we simply allow access here. > > + */ > > + > > + return true; > > +} > > This helper reads a bit weird because it always returns > "true" for the !syscall use case, but that's not at all obvious. > > Would it be cleaner to not pass in the bool at all? Something like (I also > un-nested some of the condition checks, hopefully it'll be easier to read): > > bool userfaultfd_syscall_allowed(int flags) > { > /* Userspace-only page faults are always allowed */ > if (flags & UFFD_USER_MODE_ONLY) > return true; > > /* > * The user is requesting kernel fault capabilities. Privileged > * users are always allowed even for kernel fault traps. > */ > if (capable(CAP_SYS_PTRACE)) > return true; > > /* Whether we allow unprivileged users for kernel faults? */ > return sysctl_unprivileged_userfaultfd; > } > > Then below... > > > + > > +static int new_userfaultfd(bool is_syscall, int flags) > > { > > struct userfaultfd_ctx *ctx; > > int fd; > > > > - if (!sysctl_unprivileged_userfaultfd && > > - (flags & UFFD_USER_MODE_ONLY) == 0 && > > - !capable(CAP_SYS_PTRACE)) { > > - printk_once(KERN_WARNING "uffd: Set unprivileged_userfaultfd " > > - "sysctl knob to 1 if kernel faults must be handled " > > - "without obtaining CAP_SYS_PTRACE capability\n"); > > + if (!userfaultfd_allowed(is_syscall, flags)) > > return -EPERM; > > - } > > .. we could write it as: > > if (is_syscall && !userfaultfd_syscall_allowed(flags)) > return -EPERM; > > What do you think? > > > > > BUG_ON(!current->mm); > > > > @@ -2083,6 +2093,10 @@ SYSCALL_DEFINE1(userfaultfd, int, flags) > > refcount_set(&ctx->refcount, 1); > > ctx->flags = flags; > > ctx->features = 0; > > + /* > > + * If UFFD_USER_MODE_ONLY is not set, then userfaultfd_allowed() above > > + * decided that kernel faults were allowed and should be handled. > > + */ > > Hmm.. why does this need to be added above "released=false"?
Did you want to > add this (perhaps) above "flags" instead? > > IMHO when people read the flags it'll already be clear how it was > handled; the thing is, the comment is probably hidden deep anyway, so I'd consider > omitting it. > > The rest looks good to me, thanks. Thanks for reviewing, Peter! Most of these comments look good to me; I'll include them in a v4 after I get back to the office in about a week. > > > ctx->released = false; > > atomic_set(&ctx->mmap_changing, 0); > > ctx->mm = current->mm; > > @@ -2098,8 +2112,42 @@ SYSCALL_DEFINE1(userfaultfd, int, flags) > > return fd; > > } > > > > +SYSCALL_DEFINE1(userfaultfd, int, flags) > > +{ > > + return new_userfaultfd(true, flags); > > +} > > + > > +static int userfaultfd_dev_open(struct inode *inode, struct file *file) > > +{ > > + return 0; > > +} > > + > > +static long userfaultfd_dev_ioctl(struct file *file, unsigned int cmd, unsigned long flags) > > +{ > > + if (cmd != USERFAULTFD_IOC_NEW) > > + return -EINVAL; > > + > > + return new_userfaultfd(false, flags); > > +} > > + > > +static const struct file_operations userfaultfd_dev_fops = { > > + .open = userfaultfd_dev_open, > > + .unlocked_ioctl = userfaultfd_dev_ioctl, > > + .compat_ioctl = userfaultfd_dev_ioctl, > > + .owner = THIS_MODULE, > > + .llseek = noop_llseek, > > +}; > > + > > +static struct miscdevice userfaultfd_misc = { > > + .minor = MISC_DYNAMIC_MINOR, > > + .name = "userfaultfd", > > + .fops = &userfaultfd_dev_fops > > +}; > > + > > static int __init userfaultfd_init(void) > > { > > + WARN_ON(misc_register(&userfaultfd_misc)); > > + > > userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache", > > sizeof(struct userfaultfd_ctx), > > 0, > > diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h > > index 7d32b1e797fb..005e5e306266 100644 > > --- a/include/uapi/linux/userfaultfd.h > > +++ b/include/uapi/linux/userfaultfd.h > > @@ -12,6 +12,10 @@ > > > > #include <linux/types.h> > > > > +/* ioctls for
/dev/userfaultfd */ > > +#define USERFAULTFD_IOC 0xAA > > +#define USERFAULTFD_IOC_NEW _IO(USERFAULTFD_IOC, 0x00) > > + > > /* > > * If the UFFDIO_API is upgraded someday, the UFFDIO_UNREGISTER and > > * UFFDIO_WAKE ioctls should be defined as _IOW and not as _IOR. In > > -- > > 2.36.1.255.ge46751e96f-goog > > > > -- > Peter Xu >