On Tue, Mar 17, 2020 at 04:14:31PM -0700, Ralph Campbell wrote: > > On 3/17/20 5:59 AM, Christoph Hellwig wrote: > > On Tue, Mar 17, 2020 at 09:47:55AM -0300, Jason Gunthorpe wrote: > > > I've been using v7 of Ralph's tester and it is working well - it has > > > DEVICE_PRIVATE support so I think it can test this flow too. Ralph are > > > you able? > > > > > > This hunk seems trivial enough to me, can we include it now? > > > > I can send a separate patch for it once the tester covers it. I don't > > want to add it to the original patch as it is a significant behavior > > change compared to the existing code. > > > > Attached is an updated version of my HMM tests based on linux-5.6.0-rc6. > I ran this OK with Jason's 8+1 HMM patches, Christoph's 1-5 misc HMM clean ups, > and Christoph's 1-4 device private page changes applied. I'd like to get this to mergable, it looks pretty good now, but I have no idea about selftests - and I'm struggling to even compile the tools dir > diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug > index 69def4a9df00..4d22ce7879a7 100644 > +++ b/lib/Kconfig.debug > @@ -2162,6 +2162,18 @@ config TEST_MEMINIT > > If unsure, say N. > > +config TEST_HMM > + tristate "Test HMM (Heterogeneous Memory Management)" > + depends on DEVICE_PRIVATE > + select HMM_MIRROR > + select MMU_NOTIFIER extra spaces In general I wonder if it even makes sense that DEVICE_PRIVATE is user selectable? > +static int dmirror_fops_open(struct inode *inode, struct file *filp) > +{ > + struct cdev *cdev = inode->i_cdev; > + struct dmirror *dmirror; > + int ret; > + > + /* Mirror this process address space */ > + dmirror = kzalloc(sizeof(*dmirror), GFP_KERNEL); > + if (dmirror == NULL) > + return -ENOMEM; > + > + dmirror->mdevice = container_of(cdev, struct dmirror_device, cdevice); > + mutex_init(&dmirror->mutex); > + xa_init(&dmirror->pt); > + > + ret = mmu_interval_notifier_insert(&dmirror->notifier, current->mm, > + 0, ULONG_MAX & PAGE_MASK, &dmirror_min_ops); > + if (ret) { > + kfree(dmirror); > + return ret; > + } > + > + /* Pairs with the mmdrop() in dmirror_fops_release(). */ > + mmgrab(current->mm); > + dmirror->mm = current->mm; The notifier holds a mmgrab, no need for another one > + /* Only the first open registers the address space. */ > + filp->private_data = dmirror; Not sure what this comment means > +static inline struct dmirror_device *dmirror_page_to_device(struct page *page) > + > +{ > + struct dmirror_chunk *devmem; > + > + devmem = container_of(page->pgmap, struct dmirror_chunk, pagemap); > + return devmem->mdevice; > +} extra devmem var is not really needed > + > +static bool dmirror_device_is_mine(struct dmirror_device *mdevice, > + struct page *page) > +{ > + if (!is_zone_device_page(page)) > + return false; > + return page->pgmap->ops == &dmirror_devmem_ops && > + dmirror_page_to_device(page) == mdevice; > +} Use new owner stuff, right? Actually this is redunant now, the check should be just WARN_ON pageowner != self owner > +static int dmirror_do_fault(struct dmirror *dmirror, struct hmm_range *range) > +{ > + uint64_t *pfns = range->pfns; > + unsigned long pfn; > + > + for (pfn = (range->start >> PAGE_SHIFT); > + pfn < (range->end >> PAGE_SHIFT); > + pfn++, pfns++) { > + struct page *page; > + void *entry; > + > + /* > + * HMM_PFN_ERROR is returned if it is accessing invalid memory > + * either because of memory error (hardware detected memory > + * corruption) or more likely because of truncate on mmap > + * file. > + */ > + if (*pfns == range->values[HMM_PFN_ERROR]) > + return -EFAULT; Unless that snapshot is use hmm_range_fault() never returns success and sets PFN_ERROR, so this should be a WARN_ON > + if (!(*pfns & range->flags[HMM_PFN_VALID])) > + return -EFAULT; Same with valid. > + page = hmm_device_entry_to_page(range, *pfns); > + /* We asked for pages to be populated but check anyway. */ > + if (!page) > + return -EFAULT; WARN_ON > + if (is_zone_device_page(page)) { > + /* > + * TODO: need a way to ask HMM to fault foreign zone > + * device private pages. > + */ > + if (!dmirror_device_is_mine(dmirror->mdevice, page)) > + continue; Actually re > +static bool dmirror_interval_invalidate(struct mmu_interval_notifier *mni, > + const struct mmu_notifier_range *range, > + unsigned long cur_seq) > +{ > + struct dmirror *dmirror = container_of(mni, struct dmirror, notifier); > + struct mm_struct *mm = dmirror->mm; > + > + /* > + * If the process doesn't exist, we don't need to invalidate the > + * device page table since the address space will be torn down. > + */ > + if (!mmget_not_zero(mm)) > + return true; Why? Don't the notifiers provide for this already. mmget_not_zero() is required before calling hmm_range_fault() though > +static int dmirror_fault(struct dmirror *dmirror, unsigned long start, > + unsigned long end, bool write) > +{ > + struct mm_struct *mm = dmirror->mm; > + unsigned long addr; > + uint64_t pfns[64]; > + struct hmm_range range = { > + .notifier = &dmirror->notifier, > + .pfns = pfns, > + .flags = dmirror_hmm_flags, > + .values = dmirror_hmm_values, > + .pfn_shift = DPT_SHIFT, > + .pfn_flags_mask = ~(dmirror_hmm_flags[HMM_PFN_VALID] | > + dmirror_hmm_flags[HMM_PFN_WRITE]), > + .default_flags = dmirror_hmm_flags[HMM_PFN_VALID] | > + (write ? dmirror_hmm_flags[HMM_PFN_WRITE] : 0), > + .dev_private_owner = dmirror->mdevice, > + }; > + int ret = 0; > + > + /* Since the mm is for the mirrored process, get a reference first. */ > + if (!mmget_not_zero(mm)) > + return 0; Right > + for (addr = start; addr < end; addr = range.end) { > + range.start = addr; > + range.end = min(addr + (ARRAY_SIZE(pfns) << PAGE_SHIFT), end); > + > + ret = dmirror_range_fault(dmirror, &range); > + if (ret) > + break; > + } > + > + mmput(mm); > + return ret; > +} > + > +static int dmirror_do_read(struct dmirror *dmirror, unsigned long start, > + unsigned long end, struct dmirror_bounce *bounce) > +{ > + unsigned long pfn; > + void *ptr; > + > + ptr = bounce->ptr + ((start - bounce->addr) & PAGE_MASK); > + > + for (pfn = start >> PAGE_SHIFT; pfn < (end >> PAGE_SHIFT); pfn++) { > + void *entry; > + struct page *page; > + void *tmp; > + > + entry = xa_load(&dmirror->pt, pfn); > + page = xa_untag_pointer(entry); > + if (!page) > + return -ENOENT; > + > + tmp = kmap(page); > + memcpy(ptr, tmp, PAGE_SIZE); > + kunmap(page); > + > + ptr += PAGE_SIZE; > + bounce->cpages++; > + } > + > + return 0; > +} > + > +static int dmirror_read(struct dmirror *dmirror, struct hmm_dmirror_cmd *cmd) > +{ > + struct dmirror_bounce bounce; > + unsigned long start, end; > + unsigned long size = cmd->npages << PAGE_SHIFT; > + int ret; > + > + start = cmd->addr; > + end = start + size; > + if (end < start) > + return -EINVAL; > + > + ret = dmirror_bounce_init(&bounce, start, size); > + if (ret) > + return ret; > + > +again: > + mutex_lock(&dmirror->mutex); > + ret = dmirror_do_read(dmirror, start, end, &bounce); > + mutex_unlock(&dmirror->mutex); > + if (ret == 0) > + ret = copy_to_user((void __user *)cmd->ptr, bounce.ptr, > + bounce.size); Use u64_to_user_ptr() instead of the cast > +static int dmirror_write(struct dmirror *dmirror, struct hmm_dmirror_cmd *cmd) > +{ > + struct dmirror_bounce bounce; > + unsigned long start, end; > + unsigned long size = cmd->npages << PAGE_SHIFT; > + int ret; > + > + start = cmd->addr; > + end = start + size; > + if (end < start) > + return -EINVAL; > + > + ret = dmirror_bounce_init(&bounce, start, size); > + if (ret) > + return ret; > + ret = copy_from_user(bounce.ptr, (void __user *)cmd->ptr, > + bounce.size); ditto > + if (ret) > + return ret; > + > +again: > + mutex_lock(&dmirror->mutex); > + ret = dmirror_do_write(dmirror, start, end, &bounce); > + mutex_unlock(&dmirror->mutex); > + if (ret == -ENOENT) { > + start = cmd->addr + (bounce.cpages << PAGE_SHIFT); > + ret = dmirror_fault(dmirror, start, end, true); > + if (ret == 0) { > + cmd->faults++; > + goto again; Use a loop instead of goto? Also I get this: lib/test_hmm.c: In function ‘dmirror_devmem_fault_alloc_and_copy’: lib/test_hmm.c:1041:25: warning: unused variable ‘vma’ [-Wunused-variable] 1041 | struct vm_area_struct *vma = args->vma; But this is a kernel bug, due to alloc_page_vma being a #define not a static inline and me having CONFIG_NUMA off in this .config Jason