On Wed, Dec 13, 2017 at 04:36:39PM -0800, Andrew Morton wrote: > On Thu, 14 Dec 2017 03:33:18 +0300 "Kirill A. Shutemov" <kirill@xxxxxxxxxxxxx> wrote: > > > On Wed, Dec 13, 2017 at 04:09:51PM -0800, Andrew Morton wrote: > > > > @@ -181,6 +182,40 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp) > > > > #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) > > > > #endif > > > > > > > > +#ifndef pmdp_establish > > > > +#define pmdp_establish pmdp_establish > > > > +static inline pmd_t pmdp_establish(struct vm_area_struct *vma, > > > > + unsigned long address, pmd_t *pmdp, pmd_t pmd) > > > > +{ > > > > + pmd_t old; > > > > + > > > > + /* > > > > + * If pmd has present bit cleared we can get away without expensive > > > > + * cmpxchg64: we can update pmdp half-by-half without racing with > > > > + * anybody. > > > > + */ > > > > + if (!(pmd_val(pmd) & _PAGE_PRESENT)) { > > > > + union split_pmd old, new, *ptr; > > > > + > > > > + ptr = (union split_pmd *)pmdp; > > > > + > > > > + new.pmd = pmd; > > > > + > > > > + /* xchg acts as a barrier before setting of the high bits */ > > > > + old.pmd_low = xchg(&ptr->pmd_low, new.pmd_low); > > > > + old.pmd_high = ptr->pmd_high; > > > > + ptr->pmd_high = new.pmd_high; > > > > + return old.pmd; > > > > + } > > > > + > > > > + { > > > > + old = *pmdp; > > > > + } while (cmpxchg64(&pmdp->pmd, old.pmd, pmd.pmd) != old.pmd); > > > > > > um, what happened here? > > > > Ouch.. Yeah, we need 'do' here. :-/ > > > > Apparently, it's a valid C code that would run the body once and it worked for > > me because I didn't hit the race condition. > > So how the heck do we test this? Add an artificial delay on the other > side to open the race window? Okay, here's a testcase I've come up with: #include <pthread.h> #include <sys/mman.h> #define MAP_BASE ((void *)0x200000) #define MAP_SIZE (0x200000) void *thread(void *p) { *(char *)p = 1; return NULL; } int main(void) { pthread_t t; char *p; p = mmap(MAP_BASE, MAP_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, -1, 0); /* Allocate THP */ *p = 1; /* Make page and PMD clean */ madvise(p, MAP_SIZE, MADV_FREE); /* New thread would make the PMD dirty again */ pthread_create(&t, NULL, thread, p); /* Trigger split_huge_pmd(). It may lose dirty bit. */ mprotect(p, 4096, PROT_READ | PROT_WRITE | PROT_EXEC); /* * Wait thread to complete. * * Page or PTE by MAP_BASE address *must* be dirty here. If it's not we * can lose data if the page is reclaimed. */ pthread_join(t, NULL); return 0; } To make the bug triggered, I had to make race window larger. See patch below. I also check if the page is dirty on exit on kernel side. It's just easier than make the page reclaimed from userspace and check if data is still there. With this patch, the testcase can trigger the race condition within 10 or so runs in my KVM box. diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2f2f5e774902..06f6a74ac36d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -33,6 +33,7 @@ #include <linux/page_idle.h> #include <linux/shmem_fs.h> #include <linux/oom.h> +#include <linux/delay.h> #include <asm/tlb.h> #include <asm/pgalloc.h> @@ -2131,6 +2132,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, write = pmd_write(*pmd); young = pmd_young(*pmd); dirty = pmd_dirty(*pmd); + mdelay(100); soft_dirty = pmd_soft_dirty(*pmd); pmdp_huge_split_prepare(vma, haddr, pmd); diff --git a/mm/memory.c b/mm/memory.c index 5eb3d2524bdc..69a207bea1da 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1305,6 +1305,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, struct page *page; page = _vm_normal_page(vma, addr, ptent, true); + + if (addr == 0x200000) + BUG_ON(!(PageDirty(page) || pte_dirty(*pte))); + if (unlikely(details) && page) { /* * unmap_shared_mapping_pages() wants to -- Kirill A. Shutemov