[I'm not sure who the best people to ask about this are, I hope the recipient list resembles something reasonable...] I have noticed that the dup_mmap() logic on fork() doesn't handle pages with active direct I/O properly: dup_mmap() seems to assume that making the PTE referencing a page readonly will always prevent future writes to the page, but if the kernel has acquired a direct reference to the page before (e.g. via get_user_pages_fast()), writes can still happen that way. The worst-case effect of this - as far as I can tell - is that when a multithreaded process forks while one thread is in the middle of sys_read() on a file that uses direct I/O with get_user_pages_fast(), the read data can become visible in the child while the parent's buffer stays uninitialized if the parent writes to a relevant page post-fork before either the I/O completes or the child writes to it. Reproducer code: ====== START hello.c ====== #define FUSE_USE_VERSION 26 #include <fuse.h> #include <stdio.h> #include <string.h> #include <errno.h> #include <fcntl.h> #include <unistd.h> #include <err.h> #include <sys/uio.h> static const char *hello_path = "/hello"; static int hello_getattr(const char *path, struct stat *stbuf) { int res = 0; memset(stbuf, 0, sizeof(struct stat)); if (strcmp(path, "/") == 0) { stbuf->st_mode = S_IFDIR | 0755; stbuf->st_nlink = 2; } else if (strcmp(path, hello_path) == 0) { stbuf->st_mode = S_IFREG | 0666; stbuf->st_nlink = 1; stbuf->st_size = 0x1000; stbuf->st_blocks = 0; } else res = -ENOENT; return res; } static int hello_readdir(const char *path, void *buf, fuse_fill_dir_t filler, off_t offset, struct fuse_file_info *fi) { filler(buf, ".", NULL, 0); filler(buf, "..", NULL, 0); filler(buf, hello_path + 1, NULL, 0); return 0; } static int hello_open(const char *path, struct fuse_file_info *fi) { return 0; } static int hello_read(const char *path, char *buf, size_t size, off_t offset, struct fuse_file_info *fi) { sleep(3); size_t len = 0x1000; if (offset < len) { if (offset + size > len) size = len - offset; memset(buf, 0, size); } else size = 0; return size; } static int hello_write(const char *path, const char *buf, size_t size, off_t offset, struct fuse_file_info *fi) { while(1) pause(); } static struct fuse_operations hello_oper = { .getattr = hello_getattr, .readdir = hello_readdir, .open = hello_open, .read = hello_read, .write = hello_write, }; int main(int argc, char *argv[]) { return fuse_main(argc, argv, &hello_oper, NULL); } ====== END hello.c ====== ====== START simple_mmap.c ====== #define _GNU_SOURCE #include <pthread.h> #include <sys/mman.h> #include <err.h> #include <unistd.h> #include <fcntl.h> #include <stdio.h> #include <signal.h> #include <sys/prctl.h> #include <sys/wait.h> __attribute__((aligned(0x1000))) char data_buffer_[0x10000]; #define data_buffer (data_buffer_ + 0x8000) void *fuse_thread(void *dummy) { /* step 2: start direct I/O on data_buffer */ int fuse_fd = open("mount/hello", O_RDWR); if (fuse_fd == -1) err(1, "unable to open FUSE fd"); printf("char in parent (before): %hhd\n", data_buffer[0]); int res = read(fuse_fd, data_buffer, 0x1000); /* step 6: read completes, show post-read state */ printf("fuse read result: %d\n", res); printf("char in parent (after): %hhd\n", data_buffer[0]); } int main(void) { /* step 1: make data_buffer dirty */ data_buffer[0] = 1; pthread_t thread; if (pthread_create(&thread, NULL, fuse_thread, NULL)) errx(1, "pthread_create"); sleep(1); /* step 3: fork a child */ pid_t child = fork(); if (child == -1) err(1, "fork"); if (child == 0) { prctl(PR_SET_PDEATHSIG, SIGKILL); sleep(1); /* step 5: show pre-read state in the child */ printf("char in child (before): %hhd\n", data_buffer[0]); sleep(3); /* step 7: read is complete, show post-read state in child */ printf("char in child (after): %hhd\n", data_buffer[0]); return 0; } /* step 4: de-CoW data_buffer in the parent */ data_buffer[0x800] = 2; int status; if (wait(&status) != child) err(1, "wait"); } ====== END simple_mmap.c ====== Repro steps: In one terminal: $ mkdir mount $ gcc -o hello hello.c -Wall -std=gnu99 `pkg-config fuse --cflags --libs` hello.c: In function ‘hello_write’: hello.c:67:1: warning: no return statement in function returning non-void [-Wreturn-type] } ^ $ ./hello -d -o direct_io mount FUSE library version: 2.9.7 [...] In a second terminal: $ gcc -pthread -o simple_mmap simple_mmap.c $ ./simple_mmap char in parent (before): 1 char in child (before): 1 fuse read result: 4096 char in parent (after): 1 char in child (after): 0 I have tested that this still works on 4.19.0-rc3+. As far as I can tell, the fix would be to immediately copy pages with `refcount - mapcount > N` in dup_mmap(), or something like that?