> -----Original Message----- > From: Ritesh Harjani <riteshh@xxxxxxxxxxxxx> > Sent: Tuesday, March 23, 2021 11:48 PM > Subject: Re: [PATCH v3 02/10] fsdax: Factor helper: dax_fault_actor() > > > > On 3/19/21 7:22 AM, Shiyang Ruan wrote: > > The core logic in the two dax page fault functions is similar. So, > > move the logic into a common helper function. Also, to facilitate the > > addition of new features, such as CoW, switch-case is no longer used > > to handle different iomap types. > > > > Signed-off-by: Shiyang Ruan <ruansy.fnst@xxxxxxxxxxx> > > --- > > fs/dax.c | 291 +++++++++++++++++++++++++++---------------------------- > > 1 file changed, 145 insertions(+), 146 deletions(-) > > > > diff --git a/fs/dax.c b/fs/dax.c > > index 7031e4302b13..33ddad0f3091 100644 > > --- a/fs/dax.c > > +++ b/fs/dax.c > > @@ -1053,6 +1053,66 @@ static vm_fault_t dax_load_hole(struct xa_state > *xas, > > return ret; > > } > > > > +#ifdef CONFIG_FS_DAX_PMD > > +static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault > *vmf, > > + struct iomap *iomap, void **entry) > > +{ > > + struct address_space *mapping = vmf->vma->vm_file->f_mapping; > > + unsigned long pmd_addr = vmf->address & PMD_MASK; > > + struct vm_area_struct *vma = vmf->vma; > > + struct inode *inode = mapping->host; > > + pgtable_t pgtable = NULL; > > + struct page *zero_page; > > + spinlock_t *ptl; > > + pmd_t pmd_entry; > > + pfn_t pfn; > > + > > + zero_page = mm_get_huge_zero_page(vmf->vma->vm_mm); > > + > > + if (unlikely(!zero_page)) > > + goto fallback; > > + > > + pfn = page_to_pfn_t(zero_page); > > + *entry = dax_insert_entry(xas, mapping, vmf, *entry, pfn, > > + DAX_PMD | DAX_ZERO_PAGE, false); > > + > > + if (arch_needs_pgtable_deposit()) { > > + pgtable = pte_alloc_one(vma->vm_mm); > > + if (!pgtable) > > + return VM_FAULT_OOM; > > + } > > + > > + ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); > > + if (!pmd_none(*(vmf->pmd))) { > > + spin_unlock(ptl); > > + goto fallback; > > + } > > + > > + 
if (pgtable) { > > + pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); > > + mm_inc_nr_ptes(vma->vm_mm); > > + } > > + pmd_entry = mk_pmd(zero_page, vmf->vma->vm_page_prot); > > + pmd_entry = pmd_mkhuge(pmd_entry); > > + set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry); > > + spin_unlock(ptl); > > + trace_dax_pmd_load_hole(inode, vmf, zero_page, *entry); > > + return VM_FAULT_NOPAGE; > > + > > +fallback: > > + if (pgtable) > > + pte_free(vma->vm_mm, pgtable); > > + trace_dax_pmd_load_hole_fallback(inode, vmf, zero_page, *entry); > > + return VM_FAULT_FALLBACK; > > +} > > +#else > > +static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault > *vmf, > > + struct iomap *iomap, void **entry) > > +{ > > + return VM_FAULT_FALLBACK; > > +} > > +#endif /* CONFIG_FS_DAX_PMD */ > > + > > s64 dax_iomap_zero(loff_t pos, u64 length, struct iomap *iomap) > > { > > sector_t sector = iomap_sector(iomap, pos & PAGE_MASK); @@ > -1289,6 > > +1349,61 @@ static int dax_fault_cow_page(struct vm_fault *vmf, struct > iomap *iomap, > > return 0; > > } > > > > +/** > > + * dax_fault_actor - Common actor to handle pfn insertion in PTE/PMD fault. > > + * @vmf: vm fault instance > > + * @pfnp: pfn to be returned > > + * @xas: the dax mapping tree of a file > > + * @entry: an unlocked dax entry to be inserted > > + * @pmd: distinguish whether it is a pmd fault > > + * @flags: iomap flags > > + * @iomap: from iomap_begin() > > + * @srcmap: from iomap_begin(), not equal to iomap if it is a CoW > > + */ > > +static vm_fault_t dax_fault_actor(struct vm_fault *vmf, pfn_t *pfnp, > > + struct xa_state *xas, void *entry, bool pmd, unsigned int flags, > > + struct iomap *iomap, struct iomap *srcmap) { > > + struct address_space *mapping = vmf->vma->vm_file->f_mapping; > > + size_t size = pmd ? PMD_SIZE : PAGE_SIZE; > > + loff_t pos = (loff_t)xas->xa_offset << PAGE_SHIFT; > > shouldn't we use xa_index here for pos ? > (loff_t)xas->xa_index << PAGE_SHIFT; Yes. 
> > > + bool write = vmf->flags & FAULT_FLAG_WRITE; > > + bool sync = dax_fault_is_synchronous(flags, vmf->vma, iomap); > > + int err = 0; > > + pfn_t pfn; > > + > > + /* if we are reading UNWRITTEN and HOLE, return a hole. */ > > + if (!write && > > + (iomap->type == IOMAP_UNWRITTEN || iomap->type == > IOMAP_HOLE)) { > > + if (!pmd) > > + return dax_load_hole(xas, mapping, &entry, vmf); > > + else > > + return dax_pmd_load_hole(xas, vmf, iomap, &entry); > > + } > > + > > + if (iomap->type != IOMAP_MAPPED) { > > + WARN_ON_ONCE(1); > > + return VM_FAULT_SIGBUS; > > + } > > So now, if the mapping is not mapped, we always cause VM_FAULT_SIGBUS. > But earlier we were only doing WARN_ON_ONCE(1). > Can you pls help answer why the change in behavior? > The behavior in PTE fault was to always check the error code via dax_fault_return(error) after WARN_ON_ONCE(1). So, I moved the dax_fault_return() into dax_fault_actor(). But I just found that, in PMD fault, it didn't do this check. So, I think I should move dax_fault_return() outside the dax_fault_actor() to keep the previous logic. > > > > > + > > + err = dax_iomap_pfn(iomap, pos, size, &pfn); > > + if (err) > > + return dax_fault_return(err); > > Same case here as well. This could return SIGBUS while earlier I am not sure > why were we only returning FALLBACK? > Yes. Thanks for pointing out. > > > + > > + entry = dax_insert_entry(xas, mapping, vmf, entry, pfn, 0, > > + write && !sync); > > In dax_insert_entry() we are passing 0 as flags. > We should be passing DAX_PMD/DAX_PTE no? > My mistake. 
> > > + > > + if (sync) > > + return dax_fault_synchronous_pfnp(pfnp, pfn); > > + > > > /* handle PMD case here */ > > + if (pmd) > > + return vmf_insert_pfn_pmd(vmf, pfn, write); > > /* handle PTE case here */ > > + if (write) > > + return vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn); > > + else > > + return vmf_insert_mixed(vmf->vma, vmf->address, pfn); > > +} > > It is easy to miss the return from if(pmd) case while reading. > A comment like above could be helpful for code review. > > > > + > > static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, > > int *iomap_errp, const struct iomap_ops *ops) > > { > > @@ -1296,17 +1411,14 @@ static vm_fault_t dax_iomap_pte_fault(struct > vm_fault *vmf, pfn_t *pfnp, > > struct address_space *mapping = vma->vm_file->f_mapping; > > XA_STATE(xas, &mapping->i_pages, vmf->pgoff); > > struct inode *inode = mapping->host; > > - unsigned long vaddr = vmf->address; > > loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; > > struct iomap iomap = { .type = IOMAP_HOLE }; > > struct iomap srcmap = { .type = IOMAP_HOLE }; > > unsigned flags = IOMAP_FAULT; > > int error, major = 0; > > bool write = vmf->flags & FAULT_FLAG_WRITE; > > - bool sync; > > vm_fault_t ret = 0; > > void *entry; > > - pfn_t pfn; > > > > trace_dax_pte_fault(inode, vmf, ret); > > /* > > @@ -1352,8 +1464,8 @@ static vm_fault_t dax_iomap_pte_fault(struct > vm_fault *vmf, pfn_t *pfnp, > > goto unlock_entry; > > } > > if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) > { > > - error = -EIO; /* fs corruption? */ > > - goto error_finish_iomap; > > + ret = VM_FAULT_SIGBUS; /* fs corruption? 
*/ > > + goto finish_iomap; > > } > > > > if (vmf->cow_page) { > > @@ -1363,49 +1475,19 @@ static vm_fault_t dax_iomap_pte_fault(struct > vm_fault *vmf, pfn_t *pfnp, > > goto finish_iomap; > > } > > > > - sync = dax_fault_is_synchronous(flags, vma, &iomap); > > - > > - switch (iomap.type) { > > - case IOMAP_MAPPED: > > - if (iomap.flags & IOMAP_F_NEW) { > > - count_vm_event(PGMAJFAULT); > > - count_memcg_event_mm(vma->vm_mm, PGMAJFAULT); > > - major = VM_FAULT_MAJOR; > > - } > > - error = dax_iomap_pfn(&iomap, pos, PAGE_SIZE, &pfn); > > - if (error < 0) > > - goto error_finish_iomap; > > - > > - entry = dax_insert_entry(&xas, mapping, vmf, entry, pfn, > > - 0, write && !sync); > > - > > - if (sync) { > > - ret = dax_fault_synchronous_pfnp(pfnp, pfn); > > - goto finish_iomap; > > - } > > - trace_dax_insert_mapping(inode, vmf, entry); > > - if (write) > > - ret = vmf_insert_mixed_mkwrite(vma, vaddr, pfn); > > - else > > - ret = vmf_insert_mixed(vma, vaddr, pfn); > > - > > + ret = dax_fault_actor(vmf, pfnp, &xas, entry, false, flags, > > + &iomap, &srcmap); > > + if (ret == VM_FAULT_SIGBUS) > > goto finish_iomap; > > - case IOMAP_UNWRITTEN: > > - case IOMAP_HOLE: > > - if (!write) { > > - ret = dax_load_hole(&xas, mapping, &entry, vmf); > > - goto finish_iomap; > > - } > > - fallthrough; > > - default: > > - WARN_ON_ONCE(1); > > - error = -EIO; > > - break; > > + > > + /* read/write MAPPED, CoW UNWRITTEN */ > > + if (iomap.flags & IOMAP_F_NEW) { > > + count_vm_event(PGMAJFAULT); > > + count_memcg_event_mm(vma->vm_mm, PGMAJFAULT); > > + major = VM_FAULT_MAJOR; > > } > > It is much better if above accounting is also done in dax_fault_actor() > function itself. Then at the end of this function we need to just do > "return ret" instead of "return ret | major" > Yes. -- Thanks, Ruan Shiyang. 
> > > > > - error_finish_iomap: > > - ret = dax_fault_return(error); > > - finish_iomap: > > +finish_iomap: > > if (ops->iomap_end) { > > int copied = PAGE_SIZE; > > > > @@ -1419,66 +1501,14 @@ static vm_fault_t dax_iomap_pte_fault(struct > vm_fault *vmf, pfn_t *pfnp, > > */ > > ops->iomap_end(inode, pos, PAGE_SIZE, copied, flags, &iomap); > > } > > - unlock_entry: > > +unlock_entry: > > dax_unlock_entry(&xas, entry); > > - out: > > +out: > > trace_dax_pte_fault_done(inode, vmf, ret); > > return ret | major; > > } > > > -ritesh