On 2021-11-18 at 1:53 a.m., Alistair Popple wrote:
> On Tuesday, 16 November 2021 6:30:18 AM AEDT Alex Sierra wrote:
>> Device memory that is cache coherent from device and CPU point of view.
>> This is used on platforms that have an advanced system bus (like CAPI
>> or CXL). Any page of a process can be migrated to such memory. However,
>> no one should be allowed to pin such memory so that it can always be
>> evicted.
>>
>> Signed-off-by: Alex Sierra <alex.sierra@xxxxxxx>
>> ---
>>  include/linux/memremap.h |  8 ++++++++
>>  include/linux/mm.h       |  9 +++++++++
>>  mm/memcontrol.c          |  6 +++---
>>  mm/memory-failure.c      |  6 +++++-
>>  mm/memremap.c            |  5 ++++-
>>  mm/migrate.c             | 21 +++++++++++++--------
>>  6 files changed, 42 insertions(+), 13 deletions(-)
>>
>> diff --git a/include/linux/memremap.h b/include/linux/memremap.h
>> index c0e9d35889e8..ff4d398edf35 100644
>> --- a/include/linux/memremap.h
>> +++ b/include/linux/memremap.h
>> @@ -39,6 +39,13 @@ struct vmem_altmap {
>>   * A more complete discussion of unaddressable memory may be found in
>>   * include/linux/hmm.h and Documentation/vm/hmm.rst.
>>   *
>> + * MEMORY_DEVICE_COHERENT:
>> + * Device memory that is cache coherent from device and CPU point of view. This
>> + * is used on platforms that have an advanced system bus (like CAPI or CXL). A
>> + * driver can hotplug the device memory using ZONE_DEVICE and with that memory
>> + * type. Any page of a process can be migrated to such memory. However no one
>> + * should be allowed to pin such memory so that it can always be evicted.
>> + *
>>   * MEMORY_DEVICE_FS_DAX:
>>   * Host memory that has similar access semantics as System RAM i.e. DMA
>>   * coherent and supports page pinning. In support of coordinating page
>> @@ -59,6 +66,7 @@ struct vmem_altmap {
>>  enum memory_type {
>>  	/* 0 is reserved to catch uninitialized type fields */
>>  	MEMORY_DEVICE_PRIVATE = 1,
>> +	MEMORY_DEVICE_COHERENT,
>>  	MEMORY_DEVICE_FS_DAX,
>>  	MEMORY_DEVICE_GENERIC,
>>  	MEMORY_DEVICE_PCI_P2PDMA,
>> diff --git a/include/linux/mm.h b/include/linux/mm.h
>> index 73a52aba448f..d23b6ebd2177 100644
>> --- a/include/linux/mm.h
>> +++ b/include/linux/mm.h
>> @@ -1162,6 +1162,7 @@ static inline bool page_is_devmap_managed(struct page *page)
>>  		return false;
>>  	switch (page->pgmap->type) {
>>  	case MEMORY_DEVICE_PRIVATE:
>> +	case MEMORY_DEVICE_COHERENT:
>>  	case MEMORY_DEVICE_FS_DAX:
>>  		return true;
>>  	default:
>> @@ -1191,6 +1192,14 @@ static inline bool is_device_private_page(const struct page *page)
>>  		page->pgmap->type == MEMORY_DEVICE_PRIVATE;
>>  }
>>
>> +static inline bool is_device_page(const struct page *page)
>> +{
>> +	return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
>> +		is_zone_device_page(page) &&
>> +		(page->pgmap->type == MEMORY_DEVICE_PRIVATE ||
>> +		page->pgmap->type == MEMORY_DEVICE_COHERENT);
>> +}
>> +
>>  static inline bool is_pci_p2pdma_page(const struct page *page)
>>  {
>>  	return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
>> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
>> index 6da5020a8656..e0275ebe1198 100644
>> --- a/mm/memcontrol.c
>> +++ b/mm/memcontrol.c
>> @@ -5695,8 +5695,8 @@ static int mem_cgroup_move_account(struct page *page,
>>   * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
>>   *     target for charge migration. if @target is not NULL, the entry is stored
>>   *     in target->ent.
>> - * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE
>> - *   (so ZONE_DEVICE page and thus not on the lru).
>> + * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_COHERENT
>> + *   or MEMORY_DEVICE_PRIVATE (so ZONE_DEVICE page and thus not on the lru).
>>   *   For now we such page is charge like a regular page would be as for all
>>   *   intent and purposes it is just special memory taking the place of a
>>   *   regular page.
>> @@ -5730,7 +5730,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
>>  		 */
>>  		if (page_memcg(page) == mc.from) {
>>  			ret = MC_TARGET_PAGE;
>> -			if (is_device_private_page(page))
>> +			if (is_device_page(page))
>>  				ret = MC_TARGET_DEVICE;
>>  			if (target)
>>  				target->page = page;
>> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
>> index 3e6449f2102a..51b55fc5172c 100644
>> --- a/mm/memory-failure.c
>> +++ b/mm/memory-failure.c
>> @@ -1554,12 +1554,16 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
>>  		goto unlock;
>>  	}
>>
>> -	if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
>> +	switch (pgmap->type) {
>> +	case MEMORY_DEVICE_PRIVATE:
>> +	case MEMORY_DEVICE_COHERENT:
>>  		/*
>>  		 * TODO: Handle HMM pages which may need coordination
>>  		 * with device-side memory.
>>  		 */
>>  		goto unlock;
>> +	default:
>> +		break;
>>  	}
>>
>>  	/*
>> diff --git a/mm/memremap.c b/mm/memremap.c
>> index ed593bf87109..94d6a1e01d42 100644
>> --- a/mm/memremap.c
>> +++ b/mm/memremap.c
>> @@ -44,6 +44,7 @@ EXPORT_SYMBOL(devmap_managed_key);
>>  static void devmap_managed_enable_put(struct dev_pagemap *pgmap)
>>  {
>>  	if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
>> +	    pgmap->type == MEMORY_DEVICE_COHERENT ||
>>  	    pgmap->type == MEMORY_DEVICE_FS_DAX)
>>  		static_branch_dec(&devmap_managed_key);
>>  }
>> @@ -51,6 +52,7 @@ static void devmap_managed_enable_put(struct dev_pagemap *pgmap)
>>  static void devmap_managed_enable_get(struct dev_pagemap *pgmap)
>>  {
>>  	if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
>> +	    pgmap->type == MEMORY_DEVICE_COHERENT ||
>>  	    pgmap->type == MEMORY_DEVICE_FS_DAX)
>>  		static_branch_inc(&devmap_managed_key);
>>  }
>> @@ -328,6 +330,7 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
>>
>>  	switch (pgmap->type) {
>>  	case MEMORY_DEVICE_PRIVATE:
>> +	case MEMORY_DEVICE_COHERENT:
>>  		if (!IS_ENABLED(CONFIG_DEVICE_PRIVATE)) {
>>  			WARN(1, "Device private memory not supported\n");
>>  			return ERR_PTR(-EINVAL);
>> @@ -498,7 +501,7 @@ EXPORT_SYMBOL_GPL(get_dev_pagemap);
>>  void free_devmap_managed_page(struct page *page)
>>  {
>>  	/* notify page idle for dax */
>> -	if (!is_device_private_page(page)) {
>> +	if (!is_device_page(page)) {
>>  		wake_up_var(&page->_refcount);
>>  		return;
>>  	}
>> diff --git a/mm/migrate.c b/mm/migrate.c
>> index 1852d787e6ab..f74422a42192 100644
>> --- a/mm/migrate.c
>> +++ b/mm/migrate.c
>> @@ -362,7 +362,7 @@ static int expected_page_refs(struct address_space *mapping, struct page *page)
>>  	 * Device private pages have an extra refcount as they are
>>  	 * ZONE_DEVICE pages.
>>  	 */
>> -	expected_count += is_device_private_page(page);
>> +	expected_count += is_device_page(page);
>>  	if (mapping)
>>  		expected_count += thp_nr_pages(page) + page_has_private(page);
>>
>> @@ -2503,7 +2503,7 @@ static bool migrate_vma_check_page(struct page *page)
>>  		 * FIXME proper solution is to rework migration_entry_wait() so
>>  		 * it does not need to take a reference on page.
>>  		 */
>
> Note that I have posted a patch to fix this - see
> https://lore.kernel.org/all/20211118020754.954425-1-apopple@xxxxxxxxxx/ This
> looks ok for now assuming coherent pages can never be pinned.
>
> However that raises a question - what happens when something calls
> get_user_pages() on a pfn pointing to a coherent device page? I can't see
> anything in this series that prevents pinning of coherent device pages, so we
> can't just assume they aren't pinned.

I agree. I think we'll need your patch to go in first.

I'm also wondering if we need to do something to prevent get_user_pages
from pinning device pages. And by "pin", I think migrate_vma_check_page
is not talking about FOLL_PIN, but about any get_user_pages call.

As far as I can tell, there should be nothing fundamentally wrong with
pinning device pages for a short time. But I think we'll want to avoid
FOLL_LONGTERM because that would affect our memory manager's ability to
evict device memory.

>
> In the case of device-private pages this is enforced by the fact they never
> have present pte's, so any attempt to GUP them results in a fault. But if I'm
> understanding this series correctly that won't be the case for coherent device
> pages right?

Right.

Regards,
  Felix

>
>> -	return is_device_private_page(page);
>> +	return is_device_page(page);
>>  }
>>
>>  /* For file back page */
>> @@ -2791,7 +2791,7 @@ EXPORT_SYMBOL(migrate_vma_setup);
>>   *   handle_pte_fault()
>>   *     do_anonymous_page()
>>   * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE
>> - * private page.
>> + * private or coherent page.
>>   */
>>  static void migrate_vma_insert_page(struct migrate_vma *migrate,
>>  				    unsigned long addr,
>> @@ -2867,10 +2867,15 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
>>  			swp_entry = make_readable_device_private_entry(
>>  						page_to_pfn(page));
>>  			entry = swp_entry_to_pte(swp_entry);
>> +		} else if (is_device_page(page)) {
>
> How about adding an explicit `is_device_coherent_page()` helper? It would make
> the test more explicit that this is expected to handle just coherent pages and
> I bet there will be future changes that need to differentiate between private
> and coherent pages anyway.
>
>> +			entry = pte_mkold(mk_pte(page,
>> +						 READ_ONCE(vma->vm_page_prot)));
>> +			if (vma->vm_flags & VM_WRITE)
>> +				entry = pte_mkwrite(pte_mkdirty(entry));
>>  		} else {
>>  			/*
>> -			 * For now we only support migrating to un-addressable
>> -			 * device memory.
>> +			 * We support migrating to private and coherent types
>> +			 * for device zone memory.
>>  			 */
>>  			pr_warn_once("Unsupported ZONE_DEVICE page type.\n");
>>  			goto abort;
>> @@ -2976,10 +2981,10 @@ void migrate_vma_pages(struct migrate_vma *migrate)
>>  		mapping = page_mapping(page);
>>
>>  		if (is_zone_device_page(newpage)) {
>> -			if (is_device_private_page(newpage)) {
>> +			if (is_device_page(newpage)) {
>>  				/*
>> -				 * For now only support private anonymous when
>> -				 * migrating to un-addressable device memory.
>> +				 * For now only support private and coherent
>> +				 * anonymous when migrating to device memory.
>>  				 */
>>  				if (mapping) {
>>  					migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
>>
>
>
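An explicit helper sounds good to me, too. A minimal, untested sketch of
what such a helper might look like, simply mirroring the
is_device_private_page() pattern from the mm.h hunk above (exact naming
and placement up for discussion):

static inline bool is_device_coherent_page(const struct page *page)
{
	/* True only for ZONE_DEVICE pages of type MEMORY_DEVICE_COHERENT */
	return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
		is_zone_device_page(page) &&
		page->pgmap->type == MEMORY_DEVICE_COHERENT;
}

is_device_page() could then be expressed as is_device_private_page(page) ||
is_device_coherent_page(page), and call sites like the one in
migrate_vma_insert_page() could test specifically for coherent pages.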