Hi, While running xfstests test generic/437, I noticed that the following WARN_ON_ONCE inside cow_user_page() was triggered: /* * This really shouldn't fail, because the page is there * in the page tables. But it might just be unreadable, * in which case we just give up and fill the result with * zeroes. */ if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) { /* * Give a warn in case there can be some obscure * use-case */ WARN_ON_ONCE(1); clear_page(kaddr); } Just clearing the page, in this case, will result in data corruption. I think the assumption that the copy fails because the memory is inaccessible may be wrong. In this instance, it may be the other thread issuing the madvise call? I reverted the commit in question, and the data corruption is gone. Below is the (ppc64) stack trace (this will reproduce on x86 as well). I've attached the reproducer, which is a modified version of the xfs test. You'll need to setup a file system on pmem, and mount it with -o dax. Then issue "./t_mmap_cow_race /mnt/pmem/foo". Any help tracking this down is appreciated. Thanks! Jeff [ 3690.894950] run fstests generic/437 at 2020-02-10 13:40:37 [ 3691.173277] ------------[ cut here ]------------ [ 3691.173302] WARNING: CPU: 76 PID: 51024 at mm/memory.c:2317 wp_page_copy+0xc40/0xd50 [ 3691.173308] Modules linked in: ext4 mbcache jbd2 loop rfkill ib_isert iscsi_target_mod ib_srpt target_core_mod ib_srp scsi_transport_srp i2c_dev rpcrdma sunrpc rdma_ucm ib_iser ib_umad rdma_cm iw_cm libiscsi scsi_transport_iscsi ib_ipoib ib_cm mlx5_ib ib_uverbs ib_core dax_pmem_compat device_dax nd_pmem nd_btt dax_pmem_core of_pmem libnvdimm ses enclosure ipmi_powernv ipmi_devintf sg ibmpowernv xts ipmi_msghandler opal_prd vmx_crypto leds_powernv powernv_op_panel uio_pdrv_genirq uio xfs libcrc32c sd_mod t10_pi mlx5_core ipr mpt3sas libata raid_class tg3 scsi_transport_sas mlxfw dm_mirror dm_region_hash dm_log dm_mod [last unloaded: scsi_debug] [ 3691.173363] CPU: 76 PID: 51024 Comm: t_mmap_cow_race Not tainted 5.5.0+ #4 [ 3691.173369] NIP: c0000000003df6e0 LR: c0000000003df42c CTR: 0000000000007f10 [ 3691.173375] REGS: c000002ec66fb830 TRAP: 0700 Not tainted (5.5.0+) [ 3691.173379] MSR: 9000000002029033 <SF,HV,VEC,EE,ME,IR,DR,RI,LE> CR: 24022244 XER: 20040000 [ 3691.173388] CFAR: c0000000003df458 IRQMASK: 0 GPR00: 0000000000000000 c000002ec66fbac0 c000000001746a00 0000000000007f10 GPR04: 00007fff976080f0 0000000000007f10 00000000000080f0 0000000000000000 GPR08: 0000000000000000 c000000000000000 c000002de046a000 0000000000000030 GPR12: 0000000000000040 c000002ffffa9800 0000000014a70278 00007fff981f0000 GPR16: 00007fff98f24410 00007fff98f24420 00007fff97600000 0000000014a70270 GPR20: 0000000010000fc0 c000002fa974ebf0 00007fff97600000 c0000000017fd9e8 GPR24: c0000000017fd960 c000000001775e98 c000002eabcc5a00 c000002fc3b40000 GPR28: 0000000000000000 c000002fa974ebf0 c00c00000bf0ed00 c000002ec66fbc10 [ 3691.173430] NIP [c0000000003df6e0] wp_page_copy+0xc40/0xd50 [ 3691.173434] LR [c0000000003df42c] wp_page_copy+0x98c/0xd50 [ 3691.173438] Call Trace: [ 3691.173442] [c000002ec66fbac0] [c0000000003df42c] wp_page_copy+0x98c/0xd50 (unreliable) [ 3691.173448] [c000002ec66fbb80] [c0000000003e3448] do_wp_page+0xd8/0xad0 [ 3691.173454] [c000002ec66fbbd0] [c0000000003e6248] __handle_mm_fault+0x748/0x1b90 [ 3691.173460] [c000002ec66fbcd0] [c0000000003e77b0] handle_mm_fault+0x120/0x1f0 [ 3691.173466] [c000002ec66fbd10] [c000000000086b60] __do_page_fault+0x240/0xd70 [ 3691.173473] [c000002ec66fbde0] [c0000000000876c8] do_page_fault+0x38/0xd0 [ 3691.173480] [c000002ec66fbe20] [c00000000000a888] handle_page_fault+0x10/0x30 [ 3691.173484] Instruction dump: [ 3691.173488] 38800003 4bfffaf8 60000000 60000000 7f83e378 4800d585 60000000 4bfffcb8 [ 3691.173496] 7f83e378 4bfa72b5 60000000 4bfffc8c <0fe00000> 3d42ffeb 394acb48 812a0008 [ 3691.173503] ---[ end trace a8dffbf7f73e8243 ]---
// SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2017 Intel Corporation. */ #include <errno.h> #include <fcntl.h> #include <libgen.h> #include <pthread.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/mman.h> #include <sys/stat.h> #include <sys/types.h> #include <unistd.h> #define MiB(a) ((a)*1024*1024) #define NUM_THREADS 2 void err_exit(char *op) { fprintf(stderr, "%s: %s\n", op, strerror(errno)); exit(1); } void worker_fn(void *ptr) { char *data = (char *)ptr; volatile int a; volatile int d; int i, err; for (i = 0; i < 10; i++) { a = data[0]; data[0] = a; err = madvise(data, MiB(2), MADV_DONTNEED); if (err < 0) err_exit("madvise"); for (i = 0; i < MiB(2); i++) { if ((d = data[i]) != 1) { fprintf(stderr, "Data corruption!\n"); fprintf(stderr, "data[%d] = %d\n", i, d); exit(1); } } /* Mix up the thread timings to encourage the race. */ err = usleep(rand() % 100); if (err < 0) err_exit("usleep"); } } int main(int argc, char *argv[]) { pthread_t thread[NUM_THREADS]; int i, j, fd, err; char *data; if (argc < 2) { printf("Usage: %s <file>\n", basename(argv[0])); exit(0); } fd = open(argv[1], O_RDWR|O_CREAT, S_IRUSR|S_IWUSR); if (fd < 0) err_exit("fd"); /* This allows us to map a huge page. */ ftruncate(fd, 0); ftruncate(fd, MiB(2)); /* * First we set up a shared mapping. Our write will (hopefully) get * the filesystem to give us a 2MiB huge page DAX mapping. We will * then use this 2MiB page for our private mapping race. */ data = mmap(NULL, MiB(2), PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); if (data == MAP_FAILED) err_exit("shared mmap"); memset(data, 1, MiB(2)); err = munmap(data, MiB(2)); if (err < 0) err_exit("shared munmap"); for (i = 0; i < 500; i++) { data = mmap(NULL, MiB(2), PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0); if (data == MAP_FAILED) err_exit("private mmap"); for (j = 0; j < NUM_THREADS; j++) { err = pthread_create(&thread[j], NULL, (void*)&worker_fn, data); if (err) err_exit("pthread_create"); } for (j = 0; j < NUM_THREADS; j++) { err = pthread_join(thread[j], NULL); if (err) err_exit("pthread_join"); } err = munmap(data, MiB(2)); if (err < 0) err_exit("private munmap"); } err = close(fd); if (err < 0) err_exit("close"); return 0; }