On 09/21/22 17:57, John Hubbard wrote: > On 9/21/22 16:57, Mike Kravetz wrote: > > On 09/21/22 10:48, Mike Kravetz wrote: > >> On 09/21/22 16:34, Liu Shixin wrote: > >>> The vma_lock and hugetlb_fault_mutex are dropped before handling > >>> userfault and reacquire them again after handle_userfault(), but > >>> reacquire the vma_lock could lead to UAF[1] due to the following > >>> race, > >>> > >>> hugetlb_fault > >>> hugetlb_no_page > >>> /*unlock vma_lock */ > >>> hugetlb_handle_userfault > >>> handle_userfault > >>> /* unlock mm->mmap_lock*/ > >>> vm_mmap_pgoff > >>> do_mmap > >>> mmap_region > >>> munmap_vma_range > >>> /* clean old vma */ > >>> /* lock vma_lock again <--- UAF */ > >>> /* unlock vma_lock */ > >>> > >>> Since the vma_lock will unlock immediately after hugetlb_handle_userfault(), > >>> let's drop the unneeded lock and unlock in hugetlb_handle_userfault() to fix > >>> the issue. > >> > >> Thank you very much! > >> > >> When I saw this report, the obvious fix was to do something like what you have > >> done below. That looks fine with a few minor comments. > >> > >> One question I have not yet answered is, "Does this same issue apply to > >> follow_hugetlb_page()?". I believe it does. follow_hugetlb_page calls > >> hugetlb_fault which could result in the fault being processed by userfaultfd. > >> If we experience the race above, then the associated vma could no longer be > >> valid when returning from hugetlb_fault. follow_hugetlb_page and callers > >> have a flag (locked) to deal with dropping mmap lock. However, I am not sure > >> if it is handled correctly WRT userfaultfd. I think this needs to be answered > >> before fixing. And, if the follow_hugetlb_page code needs to be fixed it > >> should be done at the same time. > >> > > > > To at least verify this code path, I added userfaultfd handling to the gup_test > > program in kernel selftests. When doing basic gup test on a hugetlb page in > > Just for those of us who are easily confused by userfaultfd cases, can you show > what that patch is? It would help me understand this a little faster. The ugly (just throw code at it to make it work) diff is below. All I did was add the sample code from the userfaultfd man page. With that change in place, I just run 'gup_test -U -z -m 2 -H' > Actually I'm expecting that Peter can easily answer this whole thing. :) diff --git a/tools/testing/selftests/vm/gup_test.c b/tools/testing/selftests/vm/gup_test.c index e43879291dac..491424d0a039 100644 --- a/tools/testing/selftests/vm/gup_test.c +++ b/tools/testing/selftests/vm/gup_test.c @@ -8,8 +8,11 @@ #include <sys/mman.h> #include <sys/stat.h> #include <sys/types.h> +#include <sys/syscall.h> #include <pthread.h> #include <assert.h> +#include <poll.h> +#include <linux/userfaultfd.h> #include <mm/gup_test.h> #include "../kselftest.h" @@ -48,6 +51,75 @@ static char *cmd_to_str(unsigned long cmd) return "Unknown command"; } +static void *fault_handler_thread(void *arg) +{ + static struct uffd_msg msg; /* Data read from userfaultfd */ + static int fault_cnt = 0; /* Number of faults so far handled */ + long uffd; + static char *page = NULL; + struct uffdio_copy uffdio_copy; + ssize_t nread; + size_t page_size = 2 * 1024 * 1024; + + uffd = (long) arg; + + if (page == NULL) { + page = mmap(NULL, page_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (page == MAP_FAILED) + exit(1); + } + + for (;;) { + struct pollfd pollfd; + int nready; + pollfd.fd = uffd; + pollfd.events = POLLIN; + nready = poll(&pollfd, 1, -1); + if (nready == -1) + exit(1); + + printf("\nfault_handler_thread():\n"); + printf(" poll() returns: nready = %d; " + "POLLIN = %d; POLLERR = %d\n", nready, + (pollfd.revents & POLLIN) != 0, + (pollfd.revents & POLLERR) != 0); + + nread = read(uffd, &msg, sizeof(msg)); + if (nread == 0) { + printf("EOF on userfaultfd!\n"); + exit(1); + } + + if (nread == -1) { + printf("nread == -1\n"); + exit(1); + } + + if (msg.event != UFFD_EVENT_PAGEFAULT) { + printf("Unexpected event on userfaultfd\n"); + exit(1); + } + + printf(" UFFD_EVENT_PAGEFAULT event: "); + printf("flags = %llx; ", msg.arg.pagefault.flags); + printf("address = %llx\n", msg.arg.pagefault.address); + + memset(page, 'A' + fault_cnt % 20, page_size); + fault_cnt++; + + uffdio_copy.src = (unsigned long) page; + uffdio_copy.dst = (unsigned long) msg.arg.pagefault.address & + ~(page_size - 1); + uffdio_copy.len = page_size; + uffdio_copy.mode = 0; + uffdio_copy.copy = 0; + if (ioctl(uffd, UFFDIO_COPY, &uffdio_copy) == -1) + exit(1); + + } +} + void *gup_thread(void *data) { struct gup_test gup = *(struct gup_test *)data; @@ -94,7 +166,11 @@ int main(int argc, char **argv) int flags = MAP_PRIVATE, touch = 0; char *file = "/dev/zero"; pthread_t *tid; + pthread_t thr; char *p; + long uffd; + struct uffdio_api uffdio_api; + struct uffdio_register uffdio_register; while ((opt = getopt(argc, argv, "m:r:n:F:f:abcj:tTLUuwWSHpz")) != -1) { switch (opt) { @@ -230,6 +306,18 @@ int main(int argc, char **argv) exit(KSFT_SKIP); } + uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK); + if (uffd == -1) { + perror("uffd open"); + exit(1); + } + uffdio_api.api = UFFD_API; + uffdio_api.features = 0; + if (ioctl(uffd, UFFDIO_API, &uffdio_api) == -1) { + perror("uffd ioctl API"); + exit(1); + } + p = mmap(NULL, size, PROT_READ | PROT_WRITE, flags, filed, 0); if (p == MAP_FAILED) { perror("mmap"); @@ -237,6 +325,20 @@ int main(int argc, char **argv) } gup.addr = (unsigned long)p; + uffdio_register.range.start = (unsigned long)p; + uffdio_register.range.len = size; + uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING; + if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) == -1) { + perror("uffd ioctl API"); + exit(1); + } + ret = pthread_create(&thr, NULL, fault_handler_thread, (void *)uffd); + if (ret) { + exit(1); + } + + sleep(5); + if (thp == 1) madvise(p, size, MADV_HUGEPAGE); else if (thp == 0) -- Mike Kravetz