On Tue, Oct 22, 2024 at 04:47:19AM +0100, Matthew Wilcox wrote:
> On Tue, Oct 22, 2024 at 02:14:39AM +0000, Roman Gushchin wrote:
> > On Mon, Oct 21, 2024 at 09:34:24PM +0100, Matthew Wilcox wrote:
> > > On Mon, Oct 21, 2024 at 05:34:55PM +0000, Roman Gushchin wrote:
> > > > Fix it by moving the mlocked flag clearance down to
> > > > free_pages_prepare().
> > >
> > > Urgh, I don't like this new reference to folio in free_pages_prepare().
> > > It feels like a layering violation. I'll think about where else we
> > > could put this.
> >
> > I agree, but it feels like it needs quite some work to do it in a nicer way,
> > no way it can be backported to older kernels. As for this fix, I don't
> > have better ideas...
>
> Well, what is KVM doing that causes this page to get mapped to userspace?
> Don't tell me to look at the reproducer as it is 403 Forbidden. All I
> can tell is that it's freed with vfree().
>
> Is it from kvm_dirty_ring_get_page()? That looks like the obvious thing,
> but I'd hate to spend a lot of time on it and then discover I was looking
> at the wrong thing.

One of the pages is vcpu->run, others belong to kvm->coalesced_mmio_ring.
/* Here is the reproducer: */

#define _GNU_SOURCE

#include <endian.h>
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <unistd.h>

/* Fallback syscall number for libcs that do not define it (325 is the x86-64 value). */
#ifndef __NR_mlock2
#define __NR_mlock2 325
#endif

/*
 * Results of the three fd-producing calls, in order:
 *   r[0] = /dev/kvm fd, r[1] = VM fd, r[2] = vCPU fd.
 * Each slot keeps its all-ones initial value if the corresponding call fails.
 */
uint64_t r[3] = {0xffffffffffffffff, 0xffffffffffffffff,
		 0xffffffffffffffff};

/* Fallback ioctl request codes in case no KVM header provided them. */
#ifndef KVM_CREATE_VM
#define KVM_CREATE_VM 0xae01
#endif

#ifndef KVM_CREATE_VCPU
#define KVM_CREATE_VCPU 0xae41
#endif

/*
 * Map an anonymous private region at a fixed address.
 * 0x32 is MAP_FIXED|MAP_ANONYMOUS|MAP_PRIVATE; the result is deliberately
 * ignored, exactly as in a best-effort fuzzer reproducer.
 */
static void map_anon_fixed(unsigned long addr, unsigned long len,
			   unsigned long prot)
{
	syscall(__NR_mmap, addr, len, prot, 0x32ul, -1, 0ul);
}

/* Store a syscall result in r[slot] unless the call reported failure (-1). */
static void keep_result(int slot, intptr_t ret)
{
	if (ret == -1)
		return;
	r[slot] = ret;
}

/*
 * Reproducer entry point.
 *
 * Sequence: set up a 16 MiB RWX arena bracketed by two inaccessible pages,
 * open /dev/kvm, create a VM and a vCPU, map the vCPU fd MAP_SHARED over the
 * start of the arena, mlock2() the first 4 MiB, then mremap() one page of
 * that mapping elsewhere with MREMAP_DONTUNMAP.
 *
 * No return value is checked beyond recording the fds; always returns 0.
 */
int main(void)
{
	/* prot=0 page just below the arena */
	map_anon_fixed(0x1ffff000ul, 0x1000ul, 0ul);
	/* the arena itself: PROT_WRITE|PROT_READ|PROT_EXEC */
	map_anon_fixed(0x20000000ul, 0x1000000ul, 7ul);
	/* prot=0 page just above the arena */
	map_anon_fixed(0x21000000ul, 0x1000ul, 0ul);

	/* 0xffffff9c is -100 (AT_FDCWD) as a 32-bit value; flags and mode are 0 */
	keep_result(0, syscall(__NR_openat, 0xffffff9c, "/dev/kvm", 0, 0));

	/* VM of type 0 */
	keep_result(1, syscall(__NR_ioctl, r[0], KVM_CREATE_VM, 0ul));

	/* vCPU with id 0 */
	keep_result(2, syscall(__NR_ioctl, r[1], KVM_CREATE_VCPU, 0ul));

	/*
	 * Map the vCPU fd at the start of the arena:
	 * prot  = PROT_SEM|PROT_WRITE|PROT_READ|PROT_EXEC (0xf)
	 * flags = MAP_FIXED|MAP_SHARED (0x11)
	 */
	syscall(__NR_mmap, 0x20000000ul, 0xb36000ul, 0xful, 0x11ul, r[2],
		0ul);

	/* Lock the first 4 MiB, which covers the start of the vCPU mapping */
	syscall(__NR_mlock2, 0x20000000ul, 0x400000ul, 0ul);

	/*
	 * Move a single page to a new fixed address while leaving the old
	 * range mapped: MREMAP_DONTUNMAP|MREMAP_FIXED|MREMAP_MAYMOVE (7).
	 */
	syscall(__NR_mremap, 0x200ab000ul, 0x1000ul, 0x1000ul, 7ul,
		0x20ffc000ul);

	return 0;
}