Looks good, feel free to add:

Reviewed-by: Alistair Popple <apopple@xxxxxxxxxx>

On Saturday, 29 January 2022 7:08:16 AM AEDT Alex Sierra wrote:
> Device memory that is cache coherent from the device and CPU point of
> view. This is used on platforms that have an advanced system bus (like
> CAPI or CXL). Any page of a process can be migrated to such memory.
> However, no one should be allowed to pin such memory so that it can
> always be evicted.
>
> Signed-off-by: Alex Sierra <alex.sierra@xxxxxxx>
> Acked-by: Felix Kuehling <Felix.Kuehling@xxxxxxx>
> ---
> v4:
> - use the same system entry path for coherent device pages at
>   migrate_vma_insert_page.
>
> - Add coherent device type support for try_to_migrate /
>   try_to_migrate_one.
> ---
>  include/linux/memremap.h |  8 +++++++
>  include/linux/mm.h       | 16 ++++++++++++++
>  mm/memcontrol.c          |  6 +++---
>  mm/memory-failure.c      |  8 +++++--
>  mm/memremap.c            | 14 ++++++++++++-
>  mm/migrate.c             | 45 ++++++++++++++++++++--------------------
>  mm/rmap.c                |  5 +++--
>  7 files changed, 71 insertions(+), 31 deletions(-)
>
> diff --git a/include/linux/memremap.h b/include/linux/memremap.h
> index 1fafcc38acba..727b8c789193 100644
> --- a/include/linux/memremap.h
> +++ b/include/linux/memremap.h
> @@ -39,6 +39,13 @@ struct vmem_altmap {
>   * A more complete discussion of unaddressable memory may be found in
>   * include/linux/hmm.h and Documentation/vm/hmm.rst.
>   *
> + * MEMORY_DEVICE_COHERENT:
> + * Device memory that is cache coherent from device and CPU point of view. This
> + * is used on platforms that have an advanced system bus (like CAPI or CXL). A
> + * driver can hotplug the device memory using ZONE_DEVICE and with that memory
> + * type. Any page of a process can be migrated to such memory. However no one
> + * should be allowed to pin such memory so that it can always be evicted.
> + *
>   * MEMORY_DEVICE_FS_DAX:
>   * Host memory that has similar access semantics as System RAM i.e. DMA
>   * coherent and supports page pinning. In support of coordinating page
> @@ -59,6 +66,7 @@ struct vmem_altmap {
>  enum memory_type {
>          /* 0 is reserved to catch uninitialized type fields */
>          MEMORY_DEVICE_PRIVATE = 1,
> +        MEMORY_DEVICE_COHERENT,
>          MEMORY_DEVICE_FS_DAX,
>          MEMORY_DEVICE_GENERIC,
>          MEMORY_DEVICE_PCI_P2PDMA,
> diff --git a/include/linux/mm.h b/include/linux/mm.h
> index e1a84b1e6787..0c61bf40edef 100644
> --- a/include/linux/mm.h
> +++ b/include/linux/mm.h
> @@ -1106,6 +1106,7 @@ static inline bool page_is_devmap_managed(struct page *page)
>                  return false;
>          switch (page->pgmap->type) {
>          case MEMORY_DEVICE_PRIVATE:
> +        case MEMORY_DEVICE_COHERENT:
>          case MEMORY_DEVICE_FS_DAX:
>                  return true;
>          default:
> @@ -1135,6 +1136,21 @@ static inline bool is_device_private_page(const struct page *page)
>                  page->pgmap->type == MEMORY_DEVICE_PRIVATE;
>  }
>
> +static inline bool is_device_coherent_page(const struct page *page)
> +{
> +        return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
> +                is_zone_device_page(page) &&
> +                page->pgmap->type == MEMORY_DEVICE_COHERENT;
> +}
> +
> +static inline bool is_dev_private_or_coherent_page(const struct page *page)
> +{
> +        return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
> +                is_zone_device_page(page) &&
> +                (page->pgmap->type == MEMORY_DEVICE_PRIVATE ||
> +                 page->pgmap->type == MEMORY_DEVICE_COHERENT);
> +}
> +
>  static inline bool is_pci_p2pdma_page(const struct page *page)
>  {
>          return IS_ENABLED(CONFIG_DEV_PAGEMAP_OPS) &&
> diff --git a/mm/memcontrol.c b/mm/memcontrol.c
> index 09d342c7cbd0..0882b5b2a857 100644
> --- a/mm/memcontrol.c
> +++ b/mm/memcontrol.c
> @@ -5691,8 +5691,8 @@ static int mem_cgroup_move_account(struct page *page,
>   * 2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
>   * target for charge migration. if @target is not NULL, the entry is stored
>   * in target->ent.
> - * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is MEMORY_DEVICE_PRIVATE
> - * (so ZONE_DEVICE page and thus not on the lru).
> + * 3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but page is device memory and
> + * thus not on the lru.
>   * For now we such page is charge like a regular page would be as for all
>   * intent and purposes it is just special memory taking the place of a
>   * regular page.
> @@ -5726,7 +5726,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
>           */
>          if (page_memcg(page) == mc.from) {
>                  ret = MC_TARGET_PAGE;
> -                if (is_device_private_page(page))
> +                if (is_dev_private_or_coherent_page(page))
>                          ret = MC_TARGET_DEVICE;
>                  if (target)
>                          target->page = page;
> diff --git a/mm/memory-failure.c b/mm/memory-failure.c
> index 14ae5c18e776..e83740f7f05e 100644
> --- a/mm/memory-failure.c
> +++ b/mm/memory-failure.c
> @@ -1611,12 +1611,16 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
>                  goto unlock;
>          }
>
> -        if (pgmap->type == MEMORY_DEVICE_PRIVATE) {
> +        switch (pgmap->type) {
> +        case MEMORY_DEVICE_PRIVATE:
> +        case MEMORY_DEVICE_COHERENT:
>                  /*
> -                 * TODO: Handle HMM pages which may need coordination
> +                 * TODO: Handle device pages which may need coordination
>                   * with device-side memory.
>                   */
>                  goto unlock;
> +        default:
> +                break;
>          }
>
>          /*
> diff --git a/mm/memremap.c b/mm/memremap.c
> index 6aa5f0c2d11f..354c8027823f 100644
> --- a/mm/memremap.c
> +++ b/mm/memremap.c
> @@ -44,6 +44,7 @@ EXPORT_SYMBOL(devmap_managed_key);
>  static void devmap_managed_enable_put(struct dev_pagemap *pgmap)
>  {
>          if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
> +            pgmap->type == MEMORY_DEVICE_COHERENT ||
>              pgmap->type == MEMORY_DEVICE_FS_DAX)
>                  static_branch_dec(&devmap_managed_key);
>  }
> @@ -51,6 +52,7 @@ static void devmap_managed_enable_put(struct dev_pagemap *pgmap)
>  static void devmap_managed_enable_get(struct dev_pagemap *pgmap)
>  {
>          if (pgmap->type == MEMORY_DEVICE_PRIVATE ||
> +            pgmap->type == MEMORY_DEVICE_COHERENT ||
>              pgmap->type == MEMORY_DEVICE_FS_DAX)
>                  static_branch_inc(&devmap_managed_key);
>  }
> @@ -327,6 +329,16 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
>                          return ERR_PTR(-EINVAL);
>                  }
>                  break;
> +        case MEMORY_DEVICE_COHERENT:
> +                if (!pgmap->ops->page_free) {
> +                        WARN(1, "Missing page_free method\n");
> +                        return ERR_PTR(-EINVAL);
> +                }
> +                if (!pgmap->owner) {
> +                        WARN(1, "Missing owner\n");
> +                        return ERR_PTR(-EINVAL);
> +                }
> +                break;
>          case MEMORY_DEVICE_FS_DAX:
>                  if (!IS_ENABLED(CONFIG_ZONE_DEVICE) ||
>                      IS_ENABLED(CONFIG_FS_DAX_LIMITED)) {
> @@ -469,7 +481,7 @@ EXPORT_SYMBOL_GPL(get_dev_pagemap);
>  void free_devmap_managed_page(struct page *page)
>  {
>          /* notify page idle for dax */
> -        if (!is_device_private_page(page)) {
> +        if (!is_dev_private_or_coherent_page(page)) {
>                  wake_up_var(&page->_refcount);
>                  return;
>          }
> diff --git a/mm/migrate.c b/mm/migrate.c
> index c7da064b4781..cd137aedcfe5 100644
> --- a/mm/migrate.c
> +++ b/mm/migrate.c
> @@ -345,7 +345,7 @@ static int expected_page_refs(struct address_space *mapping, struct page *page)
>           * Device private pages have an extra refcount as they are
>           * ZONE_DEVICE pages.
>           */
> -        expected_count += is_device_private_page(page);
> +        expected_count += is_dev_private_or_coherent_page(page);
>          if (mapping)
>                  expected_count += compound_nr(page) + page_has_private(page);
>
> @@ -2611,7 +2611,7 @@ EXPORT_SYMBOL(migrate_vma_setup);
>   * handle_pte_fault()
>   * do_anonymous_page()
>   * to map in an anonymous zero page but the struct page will be a ZONE_DEVICE
> - * private page.
> + * private or coherent page.
>   */
>  static void migrate_vma_insert_page(struct migrate_vma *migrate,
>                                      unsigned long addr,
> @@ -2676,25 +2676,24 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
>           */
>          __SetPageUptodate(page);
>
> -        if (is_zone_device_page(page)) {
> -                if (is_device_private_page(page)) {
> -                        swp_entry_t swp_entry;
> -
> -                        if (vma->vm_flags & VM_WRITE)
> -                                swp_entry = make_writable_device_private_entry(
> -                                                        page_to_pfn(page));
> -                        else
> -                                swp_entry = make_readable_device_private_entry(
> -                                                        page_to_pfn(page));
> -                        entry = swp_entry_to_pte(swp_entry);
> -                } else {
> -                        /*
> -                         * For now we only support migrating to un-addressable
> -                         * device memory.
> -                         */
> -                        pr_warn_once("Unsupported ZONE_DEVICE page type.\n");
> -                        goto abort;
> -                }
> +        if (is_device_private_page(page)) {
> +                swp_entry_t swp_entry;
> +
> +                if (vma->vm_flags & VM_WRITE)
> +                        swp_entry = make_writable_device_private_entry(
> +                                                page_to_pfn(page));
> +                else
> +                        swp_entry = make_readable_device_private_entry(
> +                                                page_to_pfn(page));
> +                entry = swp_entry_to_pte(swp_entry);
> +        } else if (is_zone_device_page(page) &&
> +                   !is_device_coherent_page(page)) {
> +                /*
> +                 * Only device private and coherent pages are supported
> +                 * here; any other ZONE_DEVICE page type is rejected.
> +                 */
> +                pr_warn_once("Unsupported ZONE_DEVICE page type.\n");
> +                goto abort;
>          } else {
>                  entry = mk_pte(page, vma->vm_page_prot);
>                  if (vma->vm_flags & VM_WRITE)
> @@ -2796,10 +2795,10 @@ void migrate_vma_pages(struct migrate_vma *migrate)
>                  mapping = page_mapping(page);
>
>                  if (is_zone_device_page(newpage)) {
> -                        if (is_device_private_page(newpage)) {
> +                        if (is_dev_private_or_coherent_page(newpage)) {
>                                  /*
> -                                 * For now only support private anonymous when
> -                                 * migrating to un-addressable device memory.
> +                                 * For now only support private and coherent
> +                                 * anonymous when migrating to device memory.
>                                  */
>                                  if (mapping) {
>                                          migrate->src[i] &= ~MIGRATE_PFN_MIGRATE;
> diff --git a/mm/rmap.c b/mm/rmap.c
> index 6a1e8c7f6213..f2bd5a3da72c 100644
> --- a/mm/rmap.c
> +++ b/mm/rmap.c
> @@ -1835,7 +1835,7 @@ static bool try_to_migrate_one(struct page *page, struct vm_area_struct *vma,
>          /* Update high watermark before we lower rss */
>          update_hiwater_rss(mm);
>
> -        if (is_zone_device_page(page)) {
> +        if (is_device_private_page(page)) {
>                  unsigned long pfn = page_to_pfn(page);
>                  swp_entry_t entry;
>                  pte_t swp_pte;
> @@ -1976,7 +1976,8 @@ void try_to_migrate(struct page *page, enum ttu_flags flags)
>                                          TTU_SYNC)))
>                  return;
>
> -        if (is_zone_device_page(page) && !is_device_private_page(page))
> +        if (is_zone_device_page(page) &&
> +            !is_dev_private_or_coherent_page(page))
>                  return;
>
>          /*
>
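
For reference, the driver-side setup this enables looks roughly like the sketch below. It is untested and purely illustrative: the MEMORY_DEVICE_COHERENT type and the requirement to supply ops->page_free and owner are what this patch enforces in memremap_pages(); the example_* names, the device argument and the physical range handling are placeholders that a real driver would replace with its own device memory discovery.

#include <linux/device.h>
#include <linux/err.h>
#include <linux/memremap.h>
#include <linux/slab.h>

/* Placeholder: hand a freed page back to the driver's own allocator. */
static void example_page_free(struct page *page)
{
}

static const struct dev_pagemap_ops example_pgmap_ops = {
        .page_free = example_page_free,
};

/* Register [start, start + size) as coherent, CPU-addressable device memory. */
static int example_add_coherent_memory(struct device *dev, u64 start, u64 size)
{
        struct dev_pagemap *pgmap;
        void *addr;

        pgmap = devm_kzalloc(dev, sizeof(*pgmap), GFP_KERNEL);
        if (!pgmap)
                return -ENOMEM;

        pgmap->type = MEMORY_DEVICE_COHERENT;
        pgmap->nr_range = 1;
        pgmap->range.start = start;
        pgmap->range.end = start + size - 1;
        pgmap->ops = &example_pgmap_ops;
        /* Required for this type; migrate_vma callers match it via pgmap_owner. */
        pgmap->owner = dev;

        addr = devm_memremap_pages(dev, pgmap);
        return PTR_ERR_OR_ZERO(addr);
}

Unlike MEMORY_DEVICE_PRIVATE, no migrate_to_ram handler is needed because the pages are directly CPU-addressable; the owner field is still mandatory, presumably so migrate_vma users can tell which coherent pages belong to their device.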