The patch titled Subject: mm/hmm: change hmm_vma_fault() to allow write fault on page basis has been added to the -mm tree. Its filename is mm-hmm-change-hmm_vma_fault-to-allow-write-fault-on-page-basis.patch This patch should soon appear at http://ozlabs.org/~akpm/mmots/broken-out/mm-hmm-change-hmm_vma_fault-to-allow-write-fault-on-page-basis.patch and later at http://ozlabs.org/~akpm/mmotm/broken-out/mm-hmm-change-hmm_vma_fault-to-allow-write-fault-on-page-basis.patch Before you just go and hit "reply", please: a) Consider who else should be cc'ed b) Prefer to cc a suitable mailing list as well c) Ideally: find the original patch on the mailing list and do a reply-to-all to that, adding suitable additional cc's *** Remember to use Documentation/process/submit-checklist.rst when testing your code *** The -mm tree is included into linux-next and is updated there every 3-4 working days ------------------------------------------------------ From: Jérôme Glisse <jglisse@xxxxxxxxxx> Subject: mm/hmm: change hmm_vma_fault() to allow write fault on page basis Change hmm_vma_fault() to not take a global write fault flag for a range but instead rely on the caller to populate the HMM pfns array with the proper fault flags, i.e. HMM_PFN_VALID if the driver wants a read fault for that address, or HMM_PFN_VALID and HMM_PFN_WRITE for a write fault. Moreover, by setting HMM_PFN_DEVICE_PRIVATE the device driver can ask for device private memory to be migrated back to system memory through page faults. This is a more flexible API and it better reflects how a device handles and reports faults. 
Link: http://lkml.kernel.org/r/20180316203552.4155-4-jglisse@xxxxxxxxxx Signed-off-by: Jérôme Glisse <jglisse@xxxxxxxxxx> Cc: Evgeny Baskakov <ebaskakov@xxxxxxxxxx> Cc: Ralph Campbell <rcampbell@xxxxxxxxxx> Cc: Mark Hairgrove <mhairgrove@xxxxxxxxxx> Cc: John Hubbard <jhubbard@xxxxxxxxxx> Signed-off-by: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx> --- include/linux/hmm.h | 2 mm/hmm.c | 152 ++++++++++++++++++++++++++++++++---------- 2 files changed, 120 insertions(+), 34 deletions(-) diff -puN include/linux/hmm.h~mm-hmm-change-hmm_vma_fault-to-allow-write-fault-on-page-basis include/linux/hmm.h --- a/include/linux/hmm.h~mm-hmm-change-hmm_vma_fault-to-allow-write-fault-on-page-basis +++ a/include/linux/hmm.h @@ -317,7 +317,7 @@ bool hmm_vma_range_done(struct hmm_range * * See the function description in mm/hmm.c for further documentation. */ -int hmm_vma_fault(struct hmm_range *range, bool write, bool block); +int hmm_vma_fault(struct hmm_range *range, bool block); #endif /* IS_ENABLED(CONFIG_HMM_MIRROR) */ diff -puN mm/hmm.c~mm-hmm-change-hmm_vma_fault-to-allow-write-fault-on-page-basis mm/hmm.c --- a/mm/hmm.c~mm-hmm-change-hmm_vma_fault-to-allow-write-fault-on-page-basis +++ a/mm/hmm.c @@ -256,12 +256,10 @@ struct hmm_vma_walk { unsigned long last; bool fault; bool block; - bool write; }; -static int hmm_vma_do_fault(struct mm_walk *walk, - unsigned long addr, - uint64_t *pfn) +static int hmm_vma_do_fault(struct mm_walk *walk, unsigned long addr, + bool write_fault, uint64_t *pfn) { unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_REMOTE; struct hmm_vma_walk *hmm_vma_walk = walk->private; @@ -269,7 +267,7 @@ static int hmm_vma_do_fault(struct mm_wa int r; flags |= hmm_vma_walk->block ? 0 : FAULT_FLAG_ALLOW_RETRY; - flags |= hmm_vma_walk->write ? FAULT_FLAG_WRITE : 0; + flags |= write_fault ? 
FAULT_FLAG_WRITE : 0; r = handle_mm_fault(vma, addr, flags); if (r & VM_FAULT_RETRY) return -EBUSY; @@ -301,15 +299,17 @@ static int hmm_pfns_bad(unsigned long ad * hmm_vma_walk_hole() - handle a range back by no pmd or no pte * @start: range virtual start address (inclusive) * @end: range virtual end address (exclusive) + * @fault: should we fault or not ? + * @write_fault: write fault ? * @walk: mm_walk structure * Returns: 0 on success, -EAGAIN after page fault, or page fault error * * This is an helper call whenever pmd_none() or pte_none() returns true * or when there is no directory covering the range. */ -static int hmm_vma_walk_hole(unsigned long addr, - unsigned long end, - struct mm_walk *walk) +static int hmm_vma_walk_hole_(unsigned long addr, unsigned long end, + bool fault, bool write_fault, + struct mm_walk *walk) { struct hmm_vma_walk *hmm_vma_walk = walk->private; struct hmm_range *range = hmm_vma_walk->range; @@ -320,16 +320,89 @@ static int hmm_vma_walk_hole(unsigned lo i = (addr - range->start) >> PAGE_SHIFT; for (; addr < end; addr += PAGE_SIZE, i++) { pfns[i] = 0; - if (hmm_vma_walk->fault) { + if (fault || write_fault) { int ret; - ret = hmm_vma_do_fault(walk, addr, &pfns[i]); + ret = hmm_vma_do_fault(walk, addr, write_fault, + &pfns[i]); if (ret != -EAGAIN) return ret; } } - return hmm_vma_walk->fault ? -EAGAIN : 0; + return (fault || write_fault) ? -EAGAIN : 0; +} + +static inline void hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk, + uint64_t pfns, uint64_t cpu_flags, + bool *fault, bool *write_fault) +{ + *fault = *write_fault = false; + if (!hmm_vma_walk->fault) + return; + + /* We aren't ask to do anything ... */ + if (!(pfns & HMM_PFN_VALID)) + return; + /* If CPU page table is not valid then we need to fault */ + *fault = cpu_flags & HMM_PFN_VALID; + /* Need to write fault ? */ + if ((pfns & HMM_PFN_WRITE) && !(cpu_flags & HMM_PFN_WRITE)) { + *fault = *write_fault = false; + return; + } + /* Do we fault on device memory ? 
*/ + if ((pfns & HMM_PFN_DEVICE_PRIVATE) && + (cpu_flags & HMM_PFN_DEVICE_PRIVATE)) { + *write_fault = pfns & HMM_PFN_WRITE; + *fault = true; + } +} + +static void hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk, + const uint64_t *pfns, unsigned long npages, + uint64_t cpu_flags, bool *fault, + bool *write_fault) +{ + unsigned long i; + + if (!hmm_vma_walk->fault) { + *fault = *write_fault = false; + return; + } + + for (i = 0; i < npages; ++i) { + hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags, + fault, write_fault); + if ((*fault) || (*write_fault)) + return; + } +} + +static int hmm_vma_walk_hole(unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct hmm_vma_walk *hmm_vma_walk = walk->private; + struct hmm_range *range = hmm_vma_walk->range; + bool fault, write_fault; + unsigned long i, npages; + uint64_t *pfns; + + i = (addr - range->start) >> PAGE_SHIFT; + npages = (end - addr) >> PAGE_SHIFT; + pfns = &range->pfns[i]; + hmm_range_need_fault(hmm_vma_walk, pfns, npages, + 0, &fault, &write_fault); + return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); +} + +static inline uint64_t pmd_to_hmm_pfn_flags(pmd_t pmd) +{ + if (pmd_protnone(pmd)) + return 0; + return pmd_write(pmd) ? 
HMM_PFN_VALID | + HMM_PFN_WRITE : + HMM_PFN_VALID; } static int hmm_vma_handle_pmd(struct mm_walk *walk, @@ -339,14 +412,17 @@ static int hmm_vma_handle_pmd(struct mm_ pmd_t pmd) { struct hmm_vma_walk *hmm_vma_walk = walk->private; - unsigned long pfn, i; - uint64_t flag = 0; - - if (pmd_protnone(pmd)) - return hmm_vma_walk_hole(addr, end, walk); + unsigned long pfn, npages, i; + uint64_t flag = 0, cpu_flags; + bool fault, write_fault; + + npages = (end - addr) >> PAGE_SHIFT; + cpu_flags = pmd_to_hmm_pfn_flags(pmd); + hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags, + &fault, &write_fault); - if ((hmm_vma_walk->fault & hmm_vma_walk->write) && !pmd_write(pmd)) - return hmm_vma_walk_hole(addr, end, walk); + if (pmd_protnone(pmd) || fault || write_fault) + return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); pfn = pmd_pfn(pmd) + pte_index(addr); flag |= pmd_write(pmd) ? HMM_PFN_WRITE : 0; @@ -356,19 +432,33 @@ static int hmm_vma_handle_pmd(struct mm_ return 0; } +static inline uint64_t pte_to_hmm_pfn_flags(pte_t pte) +{ + if (pte_none(pte) || !pte_present(pte)) + return 0; + return pte_write(pte) ? 
HMM_PFN_VALID | + HMM_PFN_WRITE : + HMM_PFN_VALID; +} + static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, unsigned long end, pmd_t *pmdp, pte_t *ptep, uint64_t *pfns) { struct hmm_vma_walk *hmm_vma_walk = walk->private; struct vm_area_struct *vma = walk->vma; + bool fault, write_fault; + uint64_t cpu_flags; pte_t pte = *ptep; *pfns = 0; + cpu_flags = pte_to_hmm_pfn_flags(pte); + hmm_pte_need_fault(hmm_vma_walk, *pfns, cpu_flags, + &fault, &write_fault); if (pte_none(pte)) { *pfns = 0; - if (hmm_vma_walk->fault) + if (fault || write_fault) goto fault; return 0; } @@ -377,7 +467,7 @@ static int hmm_vma_handle_pte(struct mm_ swp_entry_t entry = pte_to_swp_entry(pte); if (!non_swap_entry(entry)) { - if (hmm_vma_walk->fault) + if (fault || write_fault) goto fault; return 0; } @@ -387,21 +477,20 @@ static int hmm_vma_handle_pte(struct mm_ * device and report anything else as error. */ if (is_device_private_entry(entry)) { + cpu_flags = HMM_PFN_VALID | HMM_PFN_DEVICE_PRIVATE; + cpu_flags |= is_write_device_private_entry(entry) ? + HMM_PFN_WRITE : 0; *pfns = hmm_pfn_from_pfn(swp_offset(entry)); - if (is_write_device_private_entry(entry)) { - *pfns |= HMM_PFN_WRITE; - } else if ((hmm_vma_walk->fault & hmm_vma_walk->write)) - goto fault; *pfns |= HMM_PFN_DEVICE_PRIVATE; return 0; } if (is_migration_entry(entry)) { - if (hmm_vma_walk->fault) { + if (fault || write_fault) { pte_unmap(ptep); hmm_vma_walk->last = addr; migration_entry_wait(vma->vm_mm, - pmdp, addr); + pmdp, addr); return -EAGAIN; } return 0; @@ -412,17 +501,16 @@ static int hmm_vma_handle_pte(struct mm_ return -EFAULT; } - if ((hmm_vma_walk->fault & hmm_vma_walk->write) && !pte_write(pte)) + if (fault || write_fault) goto fault; - *pfns = hmm_pfn_from_pfn(pte_pfn(pte)); - *pfns |= pte_write(pte) ? 
HMM_PFN_WRITE : 0; + *pfns = hmm_pfn_from_pfn(pte_pfn(pte)) | cpu_flags; return 0; fault: pte_unmap(ptep); /* Fault all pages in range if ask for */ - return hmm_vma_walk_hole(addr, end, walk); + return hmm_vma_walk_hole_(addr, end, fault, write_fault, walk); } static int hmm_vma_walk_pmd(pmd_t *pmdp, @@ -642,7 +730,6 @@ EXPORT_SYMBOL(hmm_vma_range_done); /* * hmm_vma_fault() - try to fault some address in a virtual address range * @range: range being faulted and all needed informations - * @write: is it a write fault * @block: allow blocking on fault (if true it sleeps and do not drop mmap_sem) * Returns: 0 success, error otherwise (-EAGAIN means mmap_sem have been drop) * @@ -684,7 +771,7 @@ EXPORT_SYMBOL(hmm_vma_range_done); * * YOU HAVE BEEN WARNED ! */ -int hmm_vma_fault(struct hmm_range *range, bool write, bool block) +int hmm_vma_fault(struct hmm_range *range, bool block) { struct vm_area_struct *vma = range->vma; unsigned long start = range->start; @@ -732,7 +819,6 @@ int hmm_vma_fault(struct hmm_range *rang } hmm_vma_walk.fault = true; - hmm_vma_walk.write = write; hmm_vma_walk.block = block; hmm_vma_walk.range = range; mm_walk.private = &hmm_vma_walk; _ Patches currently in -mm which might be from jglisse@xxxxxxxxxx are mm-hmm-fix-header-file-if-else-endif-maze.patch mm-hmm-hmm_pfns_bad-was-accessing-wrong-struct.patch mm-hmm-use-struct-for-hmm_vma_fault-hmm_vma_get_pfns-parameters.patch mm-hmm-remove-hmm_pfn_read-flag-and-ignore-peculiar-architecture.patch mm-hmm-use-uint64_t-for-hmm-pfn-instead-of-defining-hmm_pfn_t-to-ulong.patch mm-hmm-cleanup-special-vma-handling-vm_special.patch mm-hmm-do-not-differentiate-between-empty-entry-or-missing-directory.patch mm-hmm-rename-hmm_pfn_device_unaddressable-to-hmm_pfn_device_private.patch mm-hmm-move-hmm_pfns_clear-closer-to-where-it-is-use.patch mm-hmm-factor-out-pte-and-pmd-handling-to-simplify-hmm_vma_walk_pmd.patch mm-hmm-change-hmm_vma_fault-to-allow-write-fault-on-page-basis.patch 
mm-hmm-use-device-driver-encoding-for-hmm-pfn.patch -- To unsubscribe from this list: send the line "unsubscribe mm-commits" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html