On Tuesday, 14 September 2021 2:16:02 AM AEST Alex Sierra wrote: > Device Public type uses device memory that is coherently accesible by > the CPU. This could be shown as SP (special purpose) memory range > at the BIOS-e820 memory enumeration. If no SP memory is supported in > system, this could be faked by setting CONFIG_EFI_FAKE_MEMMAP. > > Currently, test_hmm only supports two different SP ranges of at least > 256MB size. This could be specified in the kernel parameter variable > efi_fake_mem. Ex. Two SP ranges of 1GB starting at 0x100000000 & > 0x140000000 physical address. Ex. > efi_fake_mem=1G@0x100000000:0x40000,1G@0x140000000:0x40000 > > Signed-off-by: Alex Sierra <alex.sierra@xxxxxxx> > --- > lib/test_hmm.c | 166 +++++++++++++++++++++++++++----------------- > lib/test_hmm_uapi.h | 10 ++- > 2 files changed, 113 insertions(+), 63 deletions(-) > > diff --git a/lib/test_hmm.c b/lib/test_hmm.c > index ef27e355738a..e346a48e2509 100644 > --- a/lib/test_hmm.c > +++ b/lib/test_hmm.c > @@ -469,6 +469,7 @@ static int dmirror_allocate_chunk(struct dmirror_device *mdevice, > unsigned long pfn_first; > unsigned long pfn_last; > void *ptr; > + int ret = -ENOMEM; > > devmem = kzalloc(sizeof(*devmem), GFP_KERNEL); > if (!devmem) > @@ -551,7 +552,7 @@ static int dmirror_allocate_chunk(struct dmirror_device *mdevice, > } > spin_unlock(&mdevice->lock); > > - return true; > + return 0; > > err_release: > mutex_unlock(&mdevice->devmem_lock); > @@ -560,7 +561,7 @@ static int dmirror_allocate_chunk(struct dmirror_device *mdevice, > err_devmem: > kfree(devmem); > > - return false; > + return ret; > } > > static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice) > @@ -569,8 +570,10 @@ static struct page *dmirror_devmem_alloc_page(struct dmirror_device *mdevice) > struct page *rpage; > > /* > - * This is a fake device so we alloc real system memory to store > - * our device memory. > + * For ZONE_DEVICE private type, this is a fake device so we alloc real > + * system memory to store our device memory. > + * For ZONE_DEVICE public type we use the actual dpage to store the data > + * and ignore rpage. > */ > rpage = alloc_page(GFP_HIGHUSER); > if (!rpage) > @@ -603,7 +606,7 @@ static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args, > struct dmirror *dmirror) > { > struct dmirror_device *mdevice = dmirror->mdevice; > - const unsigned long *src = args->src; > + unsigned long *src = args->src; > unsigned long *dst = args->dst; > unsigned long addr; > > @@ -621,12 +624,18 @@ static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args, > * unallocated pte_none() or read-only zero page. > */ > spage = migrate_pfn_to_page(*src); > - > + if (spage && is_zone_device_page(spage)) { > + pr_debug("page already in device spage pfn: 0x%lx\n", > + page_to_pfn(spage)); > + *src &= ~MIGRATE_PFN_MIGRATE; I don't think this is quite correct, callers shouldn't modify the src array. To mark a page as not migrating callers need to set *dst = 0. However I think this should be considered a test failure anyway. If we are migrating from system to device memory we would have set MIGRATE_VMA_SELECT_SYSTEM meaning no device private pages should be returned. Therefore I don't think we can reach this unless there is a bug right? > + continue; > + } > dpage = dmirror_devmem_alloc_page(mdevice); > if (!dpage) > continue; > > - rpage = dpage->zone_device_data; > + rpage = is_device_private_page(dpage) ? dpage->zone_device_data : > + dpage; > if (spage) > copy_highpage(rpage, spage); > else > @@ -638,8 +647,10 @@ static void dmirror_migrate_alloc_and_copy(struct migrate_vma *args, > * the simulated device memory and that page holds the pointer > * to the mirror. > */ > + rpage = dpage->zone_device_data; > rpage->zone_device_data = dmirror; > - > + pr_debug("migrating from sys to dev pfn src: 0x%lx pfn dst: 0x%lx\n", > + page_to_pfn(spage), page_to_pfn(dpage)); > *dst = migrate_pfn(page_to_pfn(dpage)) | > MIGRATE_PFN_LOCKED; > if ((*src & MIGRATE_PFN_WRITE) || > @@ -673,10 +684,13 @@ static int dmirror_migrate_finalize_and_map(struct migrate_vma *args, > continue; > > /* > - * Store the page that holds the data so the page table > - * doesn't have to deal with ZONE_DEVICE private pages. > + * For ZONE_DEVICE private pages we store the page that > + * holds the data so the page table doesn't have to deal it. > + * For ZONE_DEVICE public pages we store the actual page, since > + * the CPU has coherent access to the page. > */ > - entry = dpage->zone_device_data; > + entry = is_device_private_page(dpage) ? dpage->zone_device_data : > + dpage; > if (*dst & MIGRATE_PFN_WRITE) > entry = xa_tag_pointer(entry, DPT_XA_TAG_WRITE); > entry = xa_store(&dmirror->pt, pfn, entry, GFP_ATOMIC); > @@ -690,6 +704,47 @@ static int dmirror_migrate_finalize_and_map(struct migrate_vma *args, > return 0; > } > > +static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args, > + struct dmirror *dmirror) > +{ > + unsigned long *src = args->src; > + unsigned long *dst = args->dst; > + unsigned long start = args->start; > + unsigned long end = args->end; > + unsigned long addr; > + > + for (addr = start; addr < end; addr += PAGE_SIZE, > + src++, dst++) { > + struct page *dpage, *spage; > + > + spage = migrate_pfn_to_page(*src); > + if (!spage || !(*src & MIGRATE_PFN_MIGRATE)) > + continue; > + if (is_device_private_page(spage)) { > + spage = spage->zone_device_data; > + } else { > + pr_debug("page already in system or SPM spage pfn: 0x%lx\n", > + page_to_pfn(spage)); > + *src &= ~MIGRATE_PFN_MIGRATE; Same comment as above - shouldn't touch *src and also shouldn't be able to get here anyway. > + continue; > + } > + dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr); > + if (!dpage) > + continue; > + pr_debug("migrating from dev to sys pfn src: 0x%lx pfn dst: 0x%lx\n", > + page_to_pfn(spage), page_to_pfn(dpage)); > + > + lock_page(dpage); > + xa_erase(&dmirror->pt, addr >> PAGE_SHIFT); > + copy_highpage(dpage, spage); > + *dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED; > + if (*src & MIGRATE_PFN_WRITE) > + *dst |= MIGRATE_PFN_WRITE; > + } > + return 0; > +} > + > + > static int dmirror_migrate(struct dmirror *dmirror, > struct hmm_dmirror_cmd *cmd) > { > @@ -731,33 +786,46 @@ static int dmirror_migrate(struct dmirror *dmirror, > args.start = addr; > args.end = next; > args.pgmap_owner = dmirror->mdevice; > - args.flags = MIGRATE_VMA_SELECT_SYSTEM; > + args.flags = (!cmd->alloc_to_devmem && > + dmirror->mdevice->zone_device_type == > + HMM_DMIRROR_MEMORY_DEVICE_PRIVATE) ? > + MIGRATE_VMA_SELECT_DEVICE_PRIVATE : > + MIGRATE_VMA_SELECT_SYSTEM; > ret = migrate_vma_setup(&args); > if (ret) > goto out; > > - dmirror_migrate_alloc_and_copy(&args, dmirror); > + if (cmd->alloc_to_devmem) { > + pr_debug("Migrating from sys mem to device mem\n"); > + dmirror_migrate_alloc_and_copy(&args, dmirror); > + } else { > + pr_debug("Migrating from device mem to sys mem\n"); > + dmirror_devmem_fault_alloc_and_copy(&args, dmirror); > + } > migrate_vma_pages(&args); > - dmirror_migrate_finalize_and_map(&args, dmirror); > + if (cmd->alloc_to_devmem) > + dmirror_migrate_finalize_and_map(&args, dmirror); > migrate_vma_finalize(&args); > } > mmap_read_unlock(mm); > mmput(mm); > > - /* Return the migrated data for verification. */ > - ret = dmirror_bounce_init(&bounce, start, size); > - if (ret) > - return ret; > - mutex_lock(&dmirror->mutex); > - ret = dmirror_do_read(dmirror, start, end, &bounce); > - mutex_unlock(&dmirror->mutex); > - if (ret == 0) { > - if (copy_to_user(u64_to_user_ptr(cmd->ptr), bounce.ptr, > - bounce.size)) > - ret = -EFAULT; > + /* Return the migrated data for verification. only for pages in device zone */ > + if (cmd->alloc_to_devmem) { > + ret = dmirror_bounce_init(&bounce, start, size); > + if (ret) > + return ret; > + mutex_lock(&dmirror->mutex); > + ret = dmirror_do_read(dmirror, start, end, &bounce); > + mutex_unlock(&dmirror->mutex); > + if (ret == 0) { > + if (copy_to_user(u64_to_user_ptr(cmd->ptr), bounce.ptr, > + bounce.size)) > + ret = -EFAULT; > + } > + cmd->cpages = bounce.cpages; > + dmirror_bounce_fini(&bounce); > } > - cmd->cpages = bounce.cpages; > - dmirror_bounce_fini(&bounce); > return ret; Rather than passing a flag (alloc_to_devmem) can you split this into two functions - one to migrate to device memory and one to migrate to system memory? > out: > @@ -781,9 +849,15 @@ static void dmirror_mkentry(struct dmirror *dmirror, struct hmm_range *range, > } > > page = hmm_pfn_to_page(entry); > - if (is_device_private_page(page)) { > - /* Is the page migrated to this device or some other? */ > - if (dmirror->mdevice == dmirror_page_to_device(page)) > + if (is_device_page(page)) { > + /* Is page ZONE_DEVICE public? */ > + if (!is_device_private_page(page)) > + *perm = HMM_DMIRROR_PROT_DEV_PUBLIC; > + /* > + * Is page ZONE_DEVICE private migrated to > + * this device or some other? > + */ > + else if (dmirror->mdevice == dmirror_page_to_device(page)) > *perm = HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL; > else > *perm = HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE; > @@ -1030,38 +1104,6 @@ static void dmirror_devmem_free(struct page *page) > spin_unlock(&mdevice->lock); > } > > -static vm_fault_t dmirror_devmem_fault_alloc_and_copy(struct migrate_vma *args, > - struct dmirror *dmirror) > -{ > - const unsigned long *src = args->src; > - unsigned long *dst = args->dst; > - unsigned long start = args->start; > - unsigned long end = args->end; > - unsigned long addr; > - > - for (addr = start; addr < end; addr += PAGE_SIZE, > - src++, dst++) { > - struct page *dpage, *spage; > - > - spage = migrate_pfn_to_page(*src); > - if (!spage || !(*src & MIGRATE_PFN_MIGRATE)) > - continue; > - spage = spage->zone_device_data; > - > - dpage = alloc_page_vma(GFP_HIGHUSER_MOVABLE, args->vma, addr); > - if (!dpage) > - continue; > - > - lock_page(dpage); > - xa_erase(&dmirror->pt, addr >> PAGE_SHIFT); > - copy_highpage(dpage, spage); > - *dst = migrate_pfn(page_to_pfn(dpage)) | MIGRATE_PFN_LOCKED; > - if (*src & MIGRATE_PFN_WRITE) > - *dst |= MIGRATE_PFN_WRITE; > - } > - return 0; > -} > - > static vm_fault_t dmirror_devmem_fault(struct vm_fault *vmf) > { > struct migrate_vma args; > diff --git a/lib/test_hmm_uapi.h b/lib/test_hmm_uapi.h > index 00259d994410..b6cb8a7d2470 100644 > --- a/lib/test_hmm_uapi.h > +++ b/lib/test_hmm_uapi.h > @@ -17,8 +17,12 @@ > * @addr: (in) user address the device will read/write > * @ptr: (in) user address where device data is copied to/from > * @npages: (in) number of pages to read/write > + * @alloc_to_devmem: (in) desired allocation destination during migration. > + * True if allocation is to device memory. > + * False if allocation is to system memory. > * @cpages: (out) number of pages copied > * @faults: (out) number of device page faults seen > + * @zone_device_type: (out) zone device memory type > */ > struct hmm_dmirror_cmd { > __u64 addr; > @@ -26,7 +30,8 @@ struct hmm_dmirror_cmd { > __u64 npages; > __u64 cpages; > __u64 faults; > - __u64 zone_device_type; > + __u32 zone_device_type; > + __u32 alloc_to_devmem; Similar comment here. Rather than add a boolean flag to every command could you rename the existing command to HMM_DMIRROR_MIGRATE_TO_DEV and add another (HMM_DMIRROR_MIGRATE_TO_SYS) for this new operation? I think that would end up being a bit cleaner and matches how this actually gets used in hmm-test.c where you end up defining hmm_migrate_sys_to_dev() and hmm_migrate_to_dev_sys() anyway. - Alistair > }; > > /* Expose the address space of the calling process through hmm device file */ > @@ -49,6 +54,8 @@ struct hmm_dmirror_cmd { > * device the ioctl() is made > * HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE: Migrated device private page on some > * other device > + * HMM_DMIRROR_PROT_DEV_PUBLIC: Migrate device public page on the device > + * the ioctl() is made > */ > enum { > HMM_DMIRROR_PROT_ERROR = 0xFF, > @@ -60,6 +67,7 @@ enum { > HMM_DMIRROR_PROT_ZERO = 0x10, > HMM_DMIRROR_PROT_DEV_PRIVATE_LOCAL = 0x20, > HMM_DMIRROR_PROT_DEV_PRIVATE_REMOTE = 0x30, > + HMM_DMIRROR_PROT_DEV_PUBLIC = 0x40, > }; > > enum { >