On Wed, Jul 12, 2023 at 6:42 AM Hugh Dickins <hughd@xxxxxxxxxx> wrote:
> Bring collapse_and_free_pmd() back into collapse_pte_mapped_thp().
> It does need mmap_read_lock(), but it does not need mmap_write_lock(),
> nor vma_start_write() nor i_mmap lock nor anon_vma lock. All racing
> paths are relying on pte_offset_map_lock() and pmd_lock(), so use those.

We can still have a racing userfaultfd operation at the "/* step 4:
remove page table */" point that installs a new PTE before the page
table is removed.

To reproduce, patch a delay into the kernel like this:

diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 9a6e0d507759..27cc8dfbf3a7 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -20,6 +20,7 @@
 #include <linux/swapops.h>
 #include <linux/shmem_fs.h>
 #include <linux/ksm.h>
+#include <linux/delay.h>
 
 #include <asm/tlb.h>
 #include <asm/pgalloc.h>
@@ -1617,6 +1618,11 @@ int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr,
 	}
 
 	/* step 4: remove page table */
+	if (strcmp(current->comm, "DELAYME") == 0) {
+		pr_warn("%s: BEGIN DELAY INJECTION\n", __func__);
+		mdelay(5000);
+		pr_warn("%s: END DELAY INJECTION\n", __func__);
+	}
 
 	/* Huge page lock is still held, so page table must remain empty */
 	pml = pmd_lock(mm, pmd);

And then run the attached reproducer against mm/mm-everything.

You should get this in dmesg:

[ 206.578096] BUG: Bad rss-counter state mm:000000000942ebea type:MM_ANONPAGES val:1
// compile with "gcc -o khugepaged-vs-uffd khugepaged-vs-uffd.c -pthread"
/*
 * Reproducer: races a UFFDIO_COPY against MADV_COLLAPSE on a private
 * mapping of a tmpfs file mounted with huge=always.
 *
 * Meant to be run on a kernel carrying the delay-injection patch, which
 * stalls any task whose comm is "DELAYME" just before the page table is
 * removed in collapse_pte_mapped_thp(). While the collapsing thread is
 * stalled there, the main thread installs a page via userfaultfd.
 */
#define _GNU_SOURCE
#include <pthread.h>
#include <err.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <signal.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/stat.h>
#include <sys/prctl.h>
#include <sys/mount.h>
#include <sys/mman.h>
#include <sys/ioctl.h>
#include <linux/userfaultfd.h>

/* Fallbacks for userspace headers that predate these definitions. */
#ifndef MADV_COLLAPSE
#define MADV_COLLAPSE 25
#endif
#ifndef UFFD_USER_MODE_ONLY
#define UFFD_USER_MODE_ONLY 1
#endif

/*
 * Evaluate x exactly once; if the result equals -1 converted to x's type
 * (which also covers MAP_FAILED for mmap), print the failing expression
 * together with errno and exit(1). Otherwise yield the result.
 * Uses GNU statement expressions and __typeof__.
 */
#define SYSCHK(x) ({                                  \
	__typeof__(x) syschk_res = (x);               \
	if (syschk_res == (__typeof__(x))-1)          \
		err(1, "SYSCHK(" #x ")");             \
	syschk_res;                                   \
})

/*
 * Write the NUL-terminated string buf to the existing file name.
 * Exits the process on open failure or on a short/failed write.
 */
static void write_file(const char *name, const char *buf)
{
	int fd = SYSCHK(open(name, O_WRONLY));
	size_t len = strlen(buf);

	/* Cast so the comparison is signed on both sides; -1 must not match. */
	if (write(fd, buf, len) != (ssize_t)len)
		err(1, "write %s", name);
	close(fd);
}

/*
 * Write a single-entry id map "0 <outer_id> 1" to name (a uid_map or
 * gid_map file), i.e. id 0 inside maps to outer_id outside, length 1.
 */
static void write_map(const char *name, int outer_id)
{
	char buf[100];

	snprintf(buf, sizeof buf, "0 %d 1", outer_id);
	write_file(name, buf);
}

/*
 * Collapsing thread: dumps the start of the parent's smaps, renames itself
 * to "DELAYME" so the patched kernel injects its delay, issues
 * MADV_COLLAPSE on the 2 MiB range at 0x200000, restores a neutral name
 * and dumps smaps again.
 */
static void *thread_fn(void *dummy)
{
	/* $PPID inside the shell spawned by system() is this process. */
	system("head -n50 /proc/$PPID/smaps;echo;echo");

	SYSCHK(prctl(PR_SET_NAME, "DELAYME"));
	SYSCHK(madvise((void*)0x200000UL, 0x200000, MADV_COLLAPSE));
	SYSCHK(prctl(PR_SET_NAME, "thread"));

	system("head -n50 /proc/$PPID/smaps");
	return NULL;
}

int main(void)
{
	int outer_uid = getuid();
	int outer_gid = getgid();

	/*
	 * New user + mount namespaces with the caller mapped to id 0;
	 * presumably so the tmpfs mount below works without real root.
	 */
	SYSCHK(unshare(CLONE_NEWNS|CLONE_NEWUSER));
	SYSCHK(mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL));
	write_file("/proc/self/setgroups", "deny");
	write_map("/proc/self/uid_map", outer_uid);
	write_map("/proc/self/gid_map", outer_gid);

	/* 2 MiB file on a tmpfs mounted with huge=always. */
	SYSCHK(mount("none", "/tmp", "tmpfs", MS_NOSUID|MS_NODEV, "huge=always"));
	int fd = SYSCHK(open("/tmp/a", O_RDWR|O_CREAT, 0600));
	SYSCHK(ftruncate(fd, 0x200000));

	/*
	 * Map the file privately at 0x200000 in two 1 MiB steps, touching the
	 * first page in between.
	 * NOTE(review): presumably this ordering makes the range end up
	 * PTE-mapped rather than PMD-mapped -- confirm.
	 */
	void *ptr = SYSCHK(mmap((void*)0x200000UL, 0x100000, PROT_READ|PROT_WRITE,
				MAP_PRIVATE|MAP_FIXED_NOREPLACE, fd, 0));
	*(volatile char *)ptr;
	SYSCHK(mmap((void*)0x300000UL, 0x100000, PROT_READ|PROT_WRITE,
		    MAP_PRIVATE|MAP_FIXED_NOREPLACE, fd, 0x100000));

	/* Read one byte from each of the 512 4 KiB pages of the 2 MiB range. */
	for (int i = 0; i < 512; i++) {
		*(volatile char *)(0x200000UL + 0x1000 * i);
	}

	/* Register the whole range with userfaultfd in MISSING mode. */
	int uffd = SYSCHK(syscall(__NR_userfaultfd, UFFD_USER_MODE_ONLY));
	struct uffdio_api api = { .api = UFFD_API, .features = 0 };
	SYSCHK(ioctl(uffd, UFFDIO_API, &api));
	struct uffdio_register reg = {
		.range = { .start = 0x200000, .len = 0x200000 },
		.mode = UFFDIO_REGISTER_MODE_MISSING
	};
	SYSCHK(ioctl(uffd, UFFDIO_REGISTER, &reg));

	pthread_t thread;
	if (pthread_create(&thread, NULL, thread_fn, NULL))
		errx(1, "pthread_create");

	/*
	 * Give the collapsing thread time to reach the injected delay
	 * (5000 ms in the kernel patch), then install a page at 0x201000
	 * while it is still stalled.
	 */
	sleep(1);

	unsigned char dummy_page[0x1000] = {1};
	struct uffdio_copy copy = {
		.dst = 0x201000,
		.src = (unsigned long)dummy_page,
		.len = 0x1000,
		.mode = 0
	};
	SYSCHK(ioctl(uffd, UFFDIO_COPY, &copy));

	if (pthread_join(thread, NULL))
		errx(1, "pthread_join");

	/* Optional debugging aid: dump the full smaps after the race. */
	//system("cat /proc/$PPID/smaps");
	return 0;
}