Re: [PATCH v1] hugetlb, userfaultfd: Fix reservation restore on userfaultfd error

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi Mike,

On Tue, Nov 16, 2021 at 3:57 PM Mina Almasry <almasrymina@xxxxxxxxxx> wrote:
>
> Currently in the is_continue case in hugetlb_mcopy_atomic_pte(), if we
> bail out using "goto out_release_unlock;" in the cases where idx >=
> size, or !huge_pte_none(), the code will detect that new_pagecache_page
> == false, and so call restore_reserve_on_error().
> In this case I see restore_reserve_on_error() delete the reservation,
> and the following call to remove_inode_hugepages() will increment
> h->resv_hugepages causing a 100% reproducible leak.
>

Attached is the .c file with the 100% repro.

> We should treat the is_continue case similar to adding a page into the
> pagecache and set new_pagecache_page to true, to indicate that there is
> no reservation to restore on the error path, and we need not call
> restore_reserve_on_error().
>
> Cc: Wei Xu <weixugc@xxxxxxxxxx>
>
> Fixes: c7b1850dfb41 ("hugetlb: don't pass page cache pages to restore_reserve_on_error")
> Signed-off-by: Mina Almasry <almasrymina@xxxxxxxxxx>
> Reported-by: James Houghton <jthoughton@xxxxxxxxxx>

Not sure if this is a Cc: stable issue. If it is, I can add in v2.

> ---
>  mm/hugetlb.c | 8 ++++++++
>  1 file changed, 8 insertions(+)
>
> diff --git a/mm/hugetlb.c b/mm/hugetlb.c
> index e09159c957e3..25a7a3d84607 100644
> --- a/mm/hugetlb.c
> +++ b/mm/hugetlb.c
> @@ -5741,6 +5741,14 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
>                 page = find_lock_page(mapping, idx);
>                 if (!page)
>                         goto out;
> +               /*
> +                * Set new_pagecache_page to true, as we've added a page to the
> +                * pagecache, but userfaultfd hasn't set up a mapping for this
> +                * page yet. If we bail out before setting up the mapping, we
> +                * want to indicate to restore_reserve_on_error() that we've
> +                * added the page to the page cache.
> +                */
> +               new_pagecache_page = true;
>         } else if (!*pagep) {
>                 /* If a page already exists, then it's UFFDIO_COPY for
>                  * a non-missing case. Return -EEXIST.
> --
> 2.34.0.rc1.387.gb447b232ab-goog
#define _GNU_SOURCE

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <stdint.h>
#include <string.h>
#include <fcntl.h>
#include <sys/syscall.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
#include <linux/userfaultfd.h>

#ifndef UFFD_FEATURE_MINOR_HUGETLBFS
#define UFFD_FEATURE_MINOR_HUGETLBFS (1 << 9)
#endif

#ifndef UFFDIO_REGISTER_MODE_MINOR
#define UFFDIO_REGISTER_MODE_MINOR ((__u64)1 << 2)
#endif

#ifndef UFFDIO_CONTINUE_MODE_DONTWAKE
#define UFFDIO_CONTINUE_MODE_DONTWAKE ((uint64_t)1 << 0)
#endif

struct uffdio_continue {
	struct uffdio_range range;
	uint64_t mode;
	int64_t bytes_mapped;
};


#ifndef UFFDIO_CONTINUE
#define _UFFDIO_CONTINUE (0x07)
#define UFFDIO_CONTINUE _IOWR(UFFDIO, _UFFDIO_CONTINUE, struct uffdio_continue)
#endif

#ifndef UFFD_PAGEFAULT_FLAG_MINOR
#define UFFD_PAGEFAULT_FLAG_MINOR (1 << 2)
#endif

int userfaultfd(int mode) {
	return syscall(__NR_userfaultfd, mode);
}

int main() {

	int fd = open("/mnt/huge/test", O_RDWR | O_CREAT | S_IRUSR | S_IWUSR);
	if (fd == -1) {
		perror("opening tmpfile in hugetlbfs-mount failed");
		return -1;
	}

	const size_t len_hugepage = 512 * 4096; // 2MB
	const size_t len = 2 * len_hugepage;
	if (ftruncate(fd, len) < 0) {
		perror("ftruncate failed");
		return -1;
	}
	int uffd = userfaultfd(O_CLOEXEC | O_NONBLOCK);
	if (uffd < 0) {
		perror("uffd not created");
		return -1;
	}

	char *hva = (char*)(mmap(NULL, len, PROT_NONE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0));
	if (hva == MAP_FAILED) {
		perror("mmap hva failed");
		return -1;
	}

	char *primary = (char*)(mmap(hva, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
	if (primary == MAP_FAILED) {
		perror("mmap primary failed");
		return -1;
	}

	char *secondary = (char*)(mmap(hva, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
	if (primary == MAP_FAILED) {
		perror("mmap secondary failed");
		return -1;
	}

	struct uffdio_api api;
	api.api = UFFD_API;
	api.features = UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MISSING_HUGETLBFS;
	if (ioctl(uffd, UFFDIO_API, &api) == -1) {
		perror("UFFDIO_API failed");
		return -1;
	}

	memset(primary + len_hugepage, 0, len_hugepage);

	struct uffdio_register reg;
	reg.range.start = (unsigned long)primary;
	reg.range.len = len;
	reg.mode = UFFDIO_REGISTER_MODE_MINOR | UFFDIO_REGISTER_MODE_MISSING;
	reg.ioctls = 0;
	if (ioctl(uffd, UFFDIO_REGISTER, &reg) == -1) {
		perror("register failed");
		return -1;
	}

	memset(secondary, 0, len);

	struct uffdio_continue cont = {
		.range = (struct uffdio_range) {
			.start = (uint64_t)primary,
			.len = len,
		},
		.mode = 0,
		.bytes_mapped = 0,
	};
	if (ioctl(uffd, UFFDIO_CONTINUE, &cont) < 0) {
		perror("UFFDIO_CONTINUE failed");
		printf("Mapped %d bytes\n", cont.bytes_mapped);
	}

	close(uffd);
	munmap(secondary, len);
	munmap(primary, len);
	munmap(hva, len);

	close(fd);

	// After this unlink, a hugepage will be leaked.
	if (unlink("/mnt/huge/test") < 0) {
		perror("unlink failed");
	}

	return 0;
}

[Index of Archives]     [Linux ARM Kernel]     [Linux ARM]     [Linux Omap]     [Fedora ARM]     [IETF Annouce]     [Bugtraq]     [Linux OMAP]     [Linux MIPS]     [eCos]     [Asterisk Internet PBX]     [Linux API]

  Powered by Linux