On Tuesday, 16 November 2021 6:30:18 AM AEDT Alex Sierra wrote:
> Device memory that is cache coherent from device and CPU point of view.
> This is used on platforms that have an advanced system bus (like CAPI
> or CXL). Any page of a process can be migrated to such memory. However,
> no one should be allowed to pin such memory so that it can always be
> evicted.
>
> Signed-off-by: Alex Sierra <alex.sierra@xxxxxxx>
> ---
>  include/linux/memremap.h |  8 ++++++++
>  include/linux/mm.h       |  9 +++++++++
>  mm/memcontrol.c          |  6 +++---
>  mm/memory-failure.c      |  6 +++++-
>  mm/memremap.c            |  5 ++++-
>  mm/migrate.c             | 21 +++++++++++++--------
>  6 files changed, 42 insertions(+), 13 deletions(-)
>
> diff --git a/include/linux/memremap.h b/include/linux/memremap.h
> index c0e9d35889e8..ff4d398edf35 100644
> --- a/include/linux/memremap.h
> +++ b/include/linux/memremap.h
> @@ -39,6 +39,13 @@ struct vmem_altmap {
>   * A more complete discussion of unaddressable memory may be found in
>   * include/linux/hmm.h and Documentation/vm/hmm.rst.
>   *
> + * MEMORY_DEVICE_COHERENT:
> + * Device memory that is cache coherent from device and CPU point of view. This
> + * is used on platforms that have an advanced system bus (like CAPI or CXL). A
> + * driver can hotplug the device memory using ZONE_DEVICE and with that memory
> + * type. Any page of a process can be migrated to such memory. However no one
> + * should be allowed to pin such memory so that it can always be evicted.
> + *
>   * MEMORY_DEVICE_FS_DAX:
>   * Host memory that has similar access semantics as System RAM i.e. DMA
>   * coherent and supports page pinning. In support of coordinating page
> @@ -59,6 +66,7 @@ struct vmem_altmap {
>  enum memory_type {
>          /* 0 is reserved to catch uninitialized type fields */
>          MEMORY_DEVICE_PRIVATE = 1,
> +        MEMORY_DEVICE_COHERENT,
>          MEMORY_DEVICE_FS_DAX,
>          MEMORY_DEVICE_GENERIC,
>          MEMORY_DEVICE_PCI_P2PDMA,
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index 73a52aba448f..d23b6ebd2177 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1162,6 +1162,7 @@ static inline bool page_is_devmap_managed(struct page *page)
>                  return false;
>          switch (page->pgmap->type) {
>          case MEMORY_DEVICE_PRIVATE:
> +        case MEMORY_DEVICE_COHERENT:
>          case MEMORY_DEVICE_FS_DAX:
>                  return true;
>          default:
> @@ -1191,6 +1192,14 @@ static inline bool is_device_private_page(const struct page *page)
>                  page->pgmap->type == MEMORY_DEVICE_PRIVATE;
>  }
>
> +static inline bool is_device_page(const struct page *page)
> +{
> +        return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
> +                is_zone_device_page(page) &&
> +                (page->pgmap->type == MEMORY_DEVICE_PRIVATE ||
> +                 page->pgmap->type == MEMORY_DEVICE_COHERENT);
> +}
> +
>  static inline bool is_pci_p2pdma_page(const struct page *page)
>  {
>          return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 6da5020a8656..e0275ebe1198 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -5695,8 +5695,8 @@ static int mem_cgroup_move_account(struct page *page,
>   * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
>   *   target for charge migration. if @target is not NULL, the entry is stored
>   *   in target->ent.
> - * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE
> - *   (so ZONE_DEVICE page and thus not on the lru).
> + * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_COHERENT
> + *   or MEMORY_DEVICE_PRIVATE (so ZONE_DEVICE page and thus not on the lru).
>   *   For now we such page is charge like a regular page would be as for all
>   *   intent and purposes it is just special memory taking the place of a
>   *   regular page.
> @@ -5730,7 +5730,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
>           */
>          if (page_memcg(page) == mc.from) {
>                  ret = MC_TARGET_PAGE;
> -                if (is_device_private_page(page))
> +                if (is_device_page(page))
>                          ret = MC_TARGET_DEVICE;
>                  if (target)
>                          target->page = page;
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> index 3e6449f2102a..51b55fc5172c 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -1554,12 +1554,16 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
>                  goto unlock;
>          }
>
> -        if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
> +        switch (pgmap->type) {
> +        case MEMORY_DEVICE_PRIVATE:
> +        case MEMORY_DEVICE_COHERENT:
>                  /*
>                   * TODO: Handle HMM pages which may need coordination
>                   * with device-side memory.
>                   */
>                  goto unlock;
> +        default:
> +                break;
>          }
>
>          /*
> diff --git a/mm/memremap.c b/mm/memremap.c
> index ed593bf87109..94d6a1e01d42 100644
> --- a/mm/memremap.c
> +++ b/mm/memremap.c
> @@ -44,6 +44,7 @@ EXPORT_SYMBOL(devmap_managed_key);
>  static void devmap_managed_enable_put(struct dev_pagemap *pgmap)
>  {
>          if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
> +            pgmap->type == MEMORY_DEVICE_COHERENT ||
>              pgmap->type == MEMORY_DEVICE_FS_DAX)
>                  static_branch_dec(&devmap_managed_key);
>  }
> @@ -51,6 +52,7 @@ static void devmap_managed_enable_put(struct dev_pagemap *pgmap)
>  static void devmap_managed_enable_get(struct dev_pagemap *pgmap)
>  {
>          if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
> +            pgmap->type == MEMORY_DEVICE_COHERENT ||
>              pgmap->type == MEMORY_DEVICE_FS_DAX)
>                  static_branch_inc(&devmap_managed_key);
>  }
> @@ -328,6 +330,7 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
>
>          switch (pgmap->type) {
>          case MEMORY_DEVICE_PRIVATE:
> +        case MEMORY_DEVICE_COHERENT:
>                  if (!IS_ENABLED(CONFIG_DEVICE_PRIVATE)) {
>                          WARN(1, "Device private memory not supported\n");
>                          return ERR_PTR(-EINVAL);
>                  }
> @@ -498,7 +501,7 @@ EXPORT_SYMBOL_GPL(get_dev_pagemap);
>  void free_devmap_managed_page(struct page *page)
>  {
>          /* notify page idle for dax */
> -        if (!is_device_private_page(page)) {
> +        if (!is_device_page(page)) {
>                  wake_up_var(&page->_refcount);
>                  return;
>          }
> diff --git a/mm/migrate.c b/mm/migrate.c
> index 1852d787e6ab..f74422a42192 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -362,7 +362,7 @@ static int expected_page_refs(struct address_space *mapping, struct page *page)
>           * Device private pages have an extra refcount as they are
>           * ZONE_DEVICE pages.
>           */
> -        expected_count += is_device_private_page(page);
> +        expected_count += is_device_page(page);
>          if (mapping)
>                  expected_count += thp_nr_pages(page) + page_has_private(page);
>
> @@ -2503,7 +2503,7 @@ static bool migrate_vma_check_page(struct page *page)
>           * FIXME proper solution is to rework migration_entry_wait() so
>           * it does not need to take a reference on page.
>           */

Note that I have posted a patch to fix this - see
https://lore.kernel.org/all/20211118020754.954425-1-apopple@xxxxxxxxxx/.

This looks OK for now, assuming coherent pages can never be pinned.

However, that raises a question: what happens when something calls
get_user_pages() on a pfn pointing to a coherent device page? I can't see
anything in this series that actually prevents pinning of coherent device
pages, so we can't just assume they aren't pinned.

In the case of device-private pages this is enforced by the fact that they
never have present PTEs, so any attempt to GUP them results in a fault. But
if I'm understanding this series correctly, that won't be the case for
coherent device pages, right?
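If pinning coherent pages really must be forbidden, GUP has to learn about
them somewhere. A very rough, untested sketch of the kind of thing I have
in mind, wrapping the existing is_pinnable_page() check that the
FOLL_LONGTERM path already uses (the wrapper name here is made up):

/*
 * Untested sketch only: treat coherent device pages like ZONE_MOVABLE
 * and CMA pages, i.e. report them as not pinnable so that
 * check_and_migrate_movable_pages() migrates them back to system
 * memory on FOLL_LONGTERM instead of pinning them in device memory.
 */
static inline bool page_is_longterm_pinnable(struct page *page)
{
        if (is_zone_device_page(page) &&
            page->pgmap->type == MEMORY_DEVICE_COHERENT)
                return false;
        return is_pinnable_page(page);  /* existing ZONE_MOVABLE/CMA checks */
}

That still leaves short-term FOLL_PIN/FOLL_GET references to think about,
but at least long-term pins would no longer be able to make a coherent
page unevictable.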
> -        return is_device_private_page(page);
> +        return is_device_page(page);
>  }
>
>  /* For file back page */
> @@ -2791,7 +2791,7 @@ EXPORT_SYMBOL(migrate_vma_setup);
>   *     handle_pte_fault()
>   *       do_anonymous_page()
>   * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE
> - * private page.
> + * private or coherent page.
>   */
>  static void migrate_vma_insert_page(struct migrate_vma *migrate,
>                                      unsigned long addr,
> @@ -2867,10 +2867,15 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
>                          swp_entry = make_readable_device_private_entry(
>                                                  page_to_pfn(page));
>                          entry = swp_entry_to_pte(swp_entry);
> +                } else if (is_device_page(page)) {

How about adding an explicit `is_device_coherent_page()` helper? It would
make it explicit that this branch is only expected to handle coherent
pages, and I bet there will be future changes that need to differentiate
between private and coherent pages anyway. (Rough sketch at the end of
this mail.)

> +                        entry = pte_mkold(mk_pte(page,
> +                                                 READ_ONCE(vma->vm_page_prot)));
> +                        if (vma->vm_flags & VM_WRITE)
> +                                entry = pte_mkwrite(pte_mkdirty(entry));
>                  } else {
>                          /*
> -                         * For now we only support migrating to un-addressable
> -                         * device memory.
> +                         * We support migrating to private and coherent types
> +                         * for device zone memory.
>                           */
>                          pr_warn_once("Unsupported ZONE_DEVICE page type.\n");
>                          goto abort;
> @@ -2976,10 +2981,10 @@ void migrate_vma_pages(struct migrate_vma *migrate)
>                  mapping = page_mapping(page);
>
>                  if (is_zone_device_page(newpage)) {
> -                        if (is_device_private_page(newpage)) {
> +                        if (is_device_page(newpage)) {
>                                  /*
> -                                 * For now only support private anonymous when
> -                                 * migrating to un-addressable device memory.
> +                                 * For now only support private and coherent
> +                                 * anonymous when migrating to device memory.
>                                  */
>                                  if (mapping) {
>                                          migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
>
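For reference, the helper I'm suggesting above could be modelled directly
on is_device_private_page() from this patch; something like this
(untested):

/* Untested sketch of the helper suggested above. */
static inline bool is_device_coherent_page(const struct page *page)
{
        return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
                is_zone_device_page(page) &&
                page->pgmap->type == MEMORY_DEVICE_COHERENT;
}

is_device_page() would then just be is_device_private_page(page) ||
is_device_coherent_page(page), and the migrate_vma_insert_page() branch
above could test for exactly the case it is meant to handle.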