On Mon, 2015-11-23 at 12:53 -0800, Dan Williams wrote: > On Mon, Nov 23, 2015 at 12:04 PM, Toshi Kani <toshi.kani@xxxxxxx> wrote: > > The following oops was observed when mmap() with MAP_POPULATE > > pre-faulted pmd mappings of a DAX file. follow_trans_huge_pmd() > > expects that a target address has a struct page. > > > > BUG: unable to handle kernel paging request at ffffea0012220000 > > follow_trans_huge_pmd+0xba/0x390 > > follow_page_mask+0x33d/0x420 > > __get_user_pages+0xdc/0x800 > > populate_vma_page_range+0xb5/0xe0 > > __mm_populate+0xc5/0x150 > > vm_mmap_pgoff+0xd5/0xe0 > > SyS_mmap_pgoff+0x1c1/0x290 > > SyS_mmap+0x1b/0x30 > > > > Fix it by making the PMD pre-fault handling consistent with PTE. > > After pre-faulted in faultin_page(), follow_page_mask() calls > > follow_trans_huge_pmd(), which is changed to call follow_pfn_pmd() > > for VM_PFNMAP or VM_MIXEDMAP. follow_pfn_pmd() handles FOLL_TOUCH > > and returns with -EEXIST. > > As of 4.4.-rc2 DAX pmd mappings are disabled. So we have time to do > something more comprehensive in 4.5. Yes, I noticed during my testing that I could not use pmd... > > Reported-by: Mauricio Porto <mauricio.porto@xxxxxxx> > > Signed-off-by: Toshi Kani <toshi.kani@xxxxxxx> > > Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> > > Cc: Kirill A. Shutemov <kirill.shutemov@xxxxxxxxxxxxxxx> > > Cc: Matthew Wilcox <willy@xxxxxxxxxxxxxxx> > > Cc: Dan Williams <dan.j.williams@xxxxxxxxx> > > Cc: Ross Zwisler <ross.zwisler@xxxxxxxxxxxxxxx> > > --- > > mm/huge_memory.c | 34 ++++++++++++++++++++++++++++++++++ > > 1 file changed, 34 insertions(+) > > > > diff --git a/mm/huge_memory.c b/mm/huge_memory.c > > index d5b8920..f56e034 100644 > > --- a/mm/huge_memory.c > > +++ b/mm/huge_memory.c > [..] > > @@ -1288,6 +1315,13 @@ struct page *follow_trans_huge_pmd(struct > > vm_area_struct *vma, > > if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) > > goto out; > > > > + /* pfn map does not have a struct page */ > > + if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) { > > + ret = follow_pfn_pmd(vma, addr, pmd, flags); > > + page = ERR_PTR(ret); > > + goto out; > > + } > > + > > page = pmd_page(*pmd); > > VM_BUG_ON_PAGE(!PageHead(page), page); > > if (flags & FOLL_TOUCH) { > > I think it is already problematic that dax pmd mappings are getting > confused with transparent huge pages. We had the same issue with dax pte mapping [1], and this change extends the pfn map handling to pmd. So, this problem is not specific to pmd. [1] https://lkml.org/lkml/2015/6/23/181 > They're more closely related to > a hugetlbfs pmd mappings in that they are mapping an explicit > allocation. I have some pending patches to address this dax-pmd vs > hugetlb-pmd vs thp-pmd classification that I will post shortly. Not sure which way is better, but I am certainly interested in your changes. > By the way, I'm collecting DAX pmd regression tests [1], is this just > a simple crash upon using MAP_POPULATE? > > [1]: https://github.com/pmem/ndctl/blob/master/lib/test-dax-pmd.c Yes, this issue is easy to reproduce with MAP_POPULATE. In case it helps, attached are the test I used for testing the patches. Sorry, the code is messy since it was only intended for my internal use... - The test was originally written for the pte change [1] and comments in test.sh (ex. mlock fail, ok) reflect the results without the pte change. - For the pmd test, I modified test-mmap.c to call posix_memalign() before mmap(). By calling free(), the 2MB-aligned address from posix_memalign() can be used for mmap(). This keeps the mmap'd address aligned on 2MB. - I created test file(s) with dd (i.e. all blocks written) in my test. - The other infinite loop issue (fixed by my other patch) was found by the test case with option "-LMSr". Thanks, -Toshi
Attachment:
test.sh
Description: application/shellscript
#include <sys/types.h> #include <sys/stat.h> #include <sys/mman.h> #include <sys/time.h> #include <string.h> #include <fcntl.h> #include <stdio.h> #include <stdlib.h> #include <unistd.h> #define MB(a) ((a) * 1024UL * 1024UL) static struct timeval start_tv, stop_tv; // Calculate the difference between two time values. void tvsub(struct timeval *tdiff, struct timeval *t1, struct timeval *t0) { tdiff->tv_sec = t1->tv_sec - t0->tv_sec; tdiff->tv_usec = t1->tv_usec - t0->tv_usec; if (tdiff->tv_usec < 0) tdiff->tv_sec--, tdiff->tv_usec += 1000000; } // Start timing now. void start() { (void) gettimeofday(&start_tv, (struct timezone *) 0); } // Stop timing and return real time in microseconds. unsigned long long stop() { struct timeval tdiff; (void) gettimeofday(&stop_tv, (struct timezone *) 0); tvsub(&tdiff, &stop_tv, &start_tv); return (tdiff.tv_sec * 1000000 + tdiff.tv_usec); } void test_write(unsigned long *p, size_t size) { int i; unsigned long *wp, tmp; unsigned long long timeval; start(); for (i=0, wp=p; i<(size/sizeof(wp)); i++) *wp++ = 1; timeval = stop(); printf("Write: %10llu usec\n", timeval); } void test_read(unsigned long *p, size_t size) { int i; unsigned long *wp, tmp; unsigned long long timeval; start(); for (i=0, wp=p; i<(size/sizeof(wp)); i++) tmp = *wp++; timeval = stop(); printf("Read : %10llu usec\n", timeval); } int main(int argc, char **argv) { int fd, i, opt, ret; int oflags, mprot, mflags = 0; int is_read_only = 0, is_mlock = 0, is_mlockall = 0; int mlock_skip = 0, read_test = 0, write_test = 0; void *mptr = NULL; unsigned long *p; struct stat stat; size_t size, cpy_size; const char *file_name = NULL; while ((opt = getopt(argc, argv, "LRMSApsrw")) != -1) { switch (opt) { case 'L': file_name = "/mnt/pmem0/4Gfile"; break; case 'R': printf("> mmap: read-only\n"); is_read_only = 1; break; case 'M': printf("> mlock\n"); is_mlock = 1; break; case 'S': printf("> mlock - skip first ite\n"); mlock_skip = 1; break; case 'A': printf("> mlockall\n"); is_mlockall = 1; break; case 'p': printf("> MAP_POPULATE\n"); mflags |= MAP_POPULATE; break; case 's': printf("> MAP_SHARED\n"); mflags |= MAP_SHARED; break; case 'r': printf("> read-test\n"); read_test = 1; break; case 'w': printf("> write-test\n"); write_test = 1; break; } } if (!file_name) { file_name = "/mnt/pmem1/32Kfile"; } if (!(mflags & MAP_SHARED)) { printf("> MAP_PRIVATE\n"); mflags |= MAP_PRIVATE; } if (is_read_only) { oflags = O_RDONLY; mprot = PROT_READ; } else { oflags = O_RDWR; mprot = PROT_READ|PROT_WRITE; } fd = open(file_name, oflags); if (fd == -1) { perror("open failed"); exit(1); } ret = fstat(fd, &stat); if (ret < 0) { perror("fstat failed"); exit(1); } size = stat.st_size; printf("> open %s size 0x%x flags 0x%x\n", file_name, size, oflags); ret = posix_memalign(&mptr, MB(2), size); if (ret ==0) free(mptr); printf("> mmap mprot 0x%x flags 0x%x\n", mprot, mflags); p = mmap(mptr, size, mprot, mflags, fd, 0x0); if (!p) { perror("mmap failed"); exit(1); } if ((long unsigned)p & (MB(2)-1)) printf("> mmap: NOT 2MB aligned: 0x%p\n", p); else printf("> mmap: 2MB aligned: 0x%p\n", p); #if 0 /* SIZE LIMIT */ if (size >= MB(2)) cpy_size = MB(32); else #endif cpy_size = size; for (i=0; i<3; i++) { if (is_mlock && !mlock_skip) { printf("> mlock 0x%p\n", p); ret = mlock(p, size); if (ret < 0) { perror("mlock failed"); exit(1); } } else if (is_mlockall) { printf("> mlockall\n"); ret = mlockall(MCL_CURRENT|MCL_FUTURE); if (ret < 0) { perror("mlockall failed"); exit(1); } } printf("===== %d =====\n", i+1); if (write_test) test_write(p, cpy_size); if (read_test) test_read(p, cpy_size); if (is_mlock && !mlock_skip) { printf("> munlock 0x%p\n", p); ret = munlock(p, size); if (ret < 0) { perror("munlock failed"); exit(1); } } else if (is_mlockall) { printf("> munlockall\n"); ret = munlockall(); if (ret < 0) { perror("munlockall failed"); exit(1); } } /* skip, if requested, only the first iteration */ mlock_skip = 0; } printf("> munmap 0x%p\n", p); munmap(p, size); }