Hi Mike, On Tue, Nov 16, 2021 at 3:57 PM Mina Almasry <almasrymina@xxxxxxxxxx> wrote: > > Currently in the is_continue case in hugetlb_mcopy_atomic_pte(), if we > bail out using "goto out_release_unlock;" in the cases where idx >= > size, or !huge_pte_none(), the code will detect that new_pagecache_page > == false, and so call restore_reserve_on_error(). > In this case I see restore_reserve_on_error() delete the reservation, > and the following call to remove_inode_hugepages() will increment > h->resv_hugepages causing a 100% reproducible leak. > Attached is the .c file with the 100% repro. > We should treat the is_continue case similar to adding a page into the > pagecache and set new_pagecache_page to true, to indicate that there is > no reservation to restore on the error path, and we need not call > restore_reserve_on_error(). > > Cc: Wei Xu <weixugc@xxxxxxxxxx> > > Fixes: c7b1850dfb41 ("hugetlb: don't pass page cache pages to restore_reserve_on_error") > Signed-off-by: Mina Almasry <almasrymina@xxxxxxxxxx> > Reported-by: James Houghton <jthoughton@xxxxxxxxxx> Not sure if this is a Cc: stable issue. If it is, I can add in v2. > --- > mm/hugetlb.c | 8 ++++++++ > 1 file changed, 8 insertions(+) > > diff --git a/mm/hugetlb.c b/mm/hugetlb.c > index e09159c957e3..25a7a3d84607 100644 > --- a/mm/hugetlb.c > +++ b/mm/hugetlb.c > @@ -5741,6 +5741,14 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, > page = find_lock_page(mapping, idx); > if (!page) > goto out; > + /* > + * Set new_pagecache_page to true, as we've added a page to the > + * pagecache, but userfaultfd hasn't set up a mapping for this > + * page yet. If we bail out before setting up the mapping, we > + * want to indicate to restore_reserve_on_error() that we've > + * added the page to the page cache. > + */ > + new_pagecache_page = true; > } else if (!*pagep) { > /* If a page already exists, then it's UFFDIO_COPY for > * a non-missing case. Return -EEXIST. > -- > 2.34.0.rc1.387.gb447b232ab-goog
#define _GNU_SOURCE #include <stdio.h> #include <stdlib.h> #include <unistd.h> #include <stdint.h> #include <string.h> #include <fcntl.h> #include <sys/syscall.h> #include <sys/mman.h> #include <sys/ioctl.h> #include <linux/userfaultfd.h> #ifndef UFFD_FEATURE_MINOR_HUGETLBFS #define UFFD_FEATURE_MINOR_HUGETLBFS (1 << 9) #endif #ifndef UFFDIO_REGISTER_MODE_MINOR #define UFFDIO_REGISTER_MODE_MINOR ((__u64)1 << 2) #endif #ifndef UFFDIO_CONTINUE_MODE_DONTWAKE #define UFFDIO_CONTINUE_MODE_DONTWAKE ((uint64_t)1 << 0) #endif struct uffdio_continue { struct uffdio_range range; uint64_t mode; int64_t bytes_mapped; }; #ifndef UFFDIO_CONTINUE #define _UFFDIO_CONTINUE (0x07) #define UFFDIO_CONTINUE _IOWR(UFFDIO, _UFFDIO_CONTINUE, struct uffdio_continue) #endif #ifndef UFFD_PAGEFAULT_FLAG_MINOR #define UFFD_PAGEFAULT_FLAG_MINOR (1 << 2) #endif int userfaultfd(int mode) { return syscall(__NR_userfaultfd, mode); } int main() { int fd = open("/mnt/huge/test", O_RDWR | O_CREAT | S_IRUSR | S_IWUSR); if (fd == -1) { perror("opening tmpfile in hugetlbfs-mount failed"); return -1; } const size_t len_hugepage = 512 * 4096; // 2MB const size_t len = 2 * len_hugepage; if (ftruncate(fd, len) < 0) { perror("ftruncate failed"); return -1; } int uffd = userfaultfd(O_CLOEXEC | O_NONBLOCK); if (uffd < 0) { perror("uffd not created"); return -1; } char *hva = (char*)(mmap(NULL, len, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)); if (hva == MAP_FAILED) { perror("mmap hva failed"); return -1; } char *primary = (char*)(mmap(hva, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0)); if (primary == MAP_FAILED) { perror("mmap primary failed"); return -1; } char *secondary = (char*)(mmap(hva, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0)); if (primary == MAP_FAILED) { perror("mmap secondary failed"); return -1; } struct uffdio_api api; api.api = UFFD_API; api.features = UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MISSING_HUGETLBFS; if (ioctl(uffd, UFFDIO_API, &api) == -1) { perror("UFFDIO_API failed"); return -1; } memset(primary + len_hugepage, 0, len_hugepage); struct uffdio_register reg; reg.range.start = (unsigned long)primary; reg.range.len = len; reg.mode = UFFDIO_REGISTER_MODE_MINOR | UFFDIO_REGISTER_MODE_MISSING; reg.ioctls = 0; if (ioctl(uffd, UFFDIO_REGISTER, ®) == -1) { perror("register failed"); return -1; } memset(secondary, 0, len); struct uffdio_continue cont = { .range = (struct uffdio_range) { .start = (uint64_t)primary, .len = len, }, .mode = 0, .bytes_mapped = 0, }; if (ioctl(uffd, UFFDIO_CONTINUE, &cont) < 0) { perror("UFFDIO_CONTINUE failed"); printf("Mapped %d bytes\n", cont.bytes_mapped); } close(uffd); munmap(secondary, len); munmap(primary, len); munmap(hva, len); close(fd); // After this unlink, a hugepage will be leaked. if (unlink("/mnt/huge/test") < 0) { perror("unlink failed"); } return 0; }