Re: [PATCH v2 2/3] /dev/dax, core: file operations and dax-mmap

Dan Williams <dan.j.williams@xxxxxxxxx> · Tue, 17 May 2016 15:19:58 -0700

On Tue, May 17, 2016 at 3:57 AM, Johannes Thumshirn <jthumshirn@xxxxxxx> wrote:
> On Sat, May 14, 2016 at 11:26:29PM -0700, Dan Williams wrote:
>> The "Device DAX" core enables dax mappings of performance / feature
>> differentiated memory.  An open mapping or file handle keeps the backing
>> struct device live, but new mappings are only possible while the device
>> is enabled.   Faults are handled under rcu_read_lock to synchronize
>> with the enabled state of the device.
>>
>> Similar to the filesystem-dax case the backing memory may optionally
>> have struct page entries.  However, unlike fs-dax there is no support
>> for private mappings, or mappings that are not backed by media (see
>> use of zero-page in fs-dax).
>>
>> Mappings are always guaranteed to match the alignment of the dax_region.
>> If the dax_region is configured to have a 2MB alignment, all mappings
>> are guaranteed to be backed by a pmd entry.  Contrast this determinism
>> with the fs-dax case where pmd mappings are opportunistic.  If userspace
>> attempts to force a misaligned mapping, the driver will fail the mmap
>> attempt.  See dax_dev_check_vma() for other scenarios that are rejected,
>> like MAP_PRIVATE mappings.
>>
>> Cc: Jeff Moyer <jmoyer@xxxxxxxxxx>
>> Cc: Christoph Hellwig <hch@xxxxxx>
>> Cc: Andrew Morton <akpm@xxxxxxxxxxxxxxxxxxxx>
>> Cc: Dave Hansen <dave.hansen@xxxxxxxxxxxxxxx>
>> Cc: Ross Zwisler <ross.zwisler@xxxxxxxxxxxxxxx>
>> Signed-off-by: Dan Williams <dan.j.williams@xxxxxxxxx>
>> ---
>>  drivers/dax/Kconfig |    1
>>  drivers/dax/dax.c   |  316 +++++++++++++++++++++++++++++++++++++++++++++++++++
>>  mm/huge_memory.c    |    1
>>  mm/hugetlb.c        |    1
>>  4 files changed, 319 insertions(+)
>>
>> diff --git a/drivers/dax/Kconfig b/drivers/dax/Kconfig
>> index 86ffbaa891ad..cedab7572de3 100644
>> --- a/drivers/dax/Kconfig
>> +++ b/drivers/dax/Kconfig
>> @@ -1,6 +1,7 @@
>>  menuconfig DEV_DAX
>>       tristate "DAX: direct access to differentiated memory"
>>       default m if NVDIMM_DAX
>> +     depends on TRANSPARENT_HUGEPAGE
>>       help
>>         Support raw access to differentiated (persistence, bandwidth,
>>         latency...) memory via an mmap(2) capable character
>> diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c
>> index 8207fb33a992..b2fe8a0ce866 100644
>> --- a/drivers/dax/dax.c
>> +++ b/drivers/dax/dax.c
>> @@ -49,6 +49,7 @@ struct dax_region {
>>   * @region - parent region
>>   * @dev - device backing the character device
>>   * @kref - enable this data to be tracked in filp->private_data
>> + * @alive - !alive + rcu grace period == no new mappings can be established
>>   * @id - child id in the region
>>   * @num_resources - number of physical address extents in this device
>>   * @res - array of physical address ranges
>> @@ -57,6 +58,7 @@ struct dax_dev {
>>       struct dax_region *region;
>>       struct device *dev;
>>       struct kref kref;
>> +     bool alive;
>>       int id;
>>       int num_resources;
>>       struct resource res[0];
>> @@ -150,6 +152,10 @@ static void destroy_dax_dev(void *_dev)
>>
>>       dev_dbg(dev, "%s\n", __func__);
>>
>> +     /* disable and flush fault handlers, TODO unmap inodes */
>> +     dax_dev->alive = false;
>> +     synchronize_rcu();
>> +
>
> IIRC RCU is only protecting a pointer, not the content of the pointer, so this
> looks wrong to me.

The driver is using RCU to guarantee that all currently running fault
handlers have either completed or will see the new state of ->alive
when they start.  Reference counts are protecting the actual dax_dev
object.

>>       get_device(dev);
>>       device_unregister(dev);
>>       ida_simple_remove(&dax_region->ida, dax_dev->id);
>> @@ -173,6 +179,7 @@ int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res,
>>       memcpy(dax_dev->res, res, sizeof(*res) * count);
>>       dax_dev->num_resources = count;
>>       kref_init(&dax_dev->kref);
>> +     dax_dev->alive = true;
>>       dax_dev->region = dax_region;
>>       kref_get(&dax_region->kref);
>>
>> @@ -217,9 +224,318 @@ int devm_create_dax_dev(struct dax_region *dax_region, struct resource *res,
>>  }
>>  EXPORT_SYMBOL_GPL(devm_create_dax_dev);
>>
>> +/* return an unmapped area aligned to the dax region specified alignment */
>> +static unsigned long dax_dev_get_unmapped_area(struct file *filp,
>> +             unsigned long addr, unsigned long len, unsigned long pgoff,
>> +             unsigned long flags)
>> +{
>> +     unsigned long off, off_end, off_align, len_align, addr_align, align;
>> +     struct dax_dev *dax_dev = filp ? filp->private_data : NULL;
>> +     struct dax_region *dax_region;
>> +
>> +     if (!dax_dev || addr)
>> +             goto out;
>> +
>> +     dax_region = dax_dev->region;
>> +     align = dax_region->align;
>> +     off = pgoff << PAGE_SHIFT;
>> +     off_end = off + len;
>> +     off_align = round_up(off, align);
>> +
>> +     if ((off_end <= off_align) || ((off_end - off_align) < align))
>> +             goto out;
>> +
>> +     len_align = len + align;
>> +     if ((off + len_align) < off)
>> +             goto out;
>> +
>> +     addr_align = current->mm->get_unmapped_area(filp, addr, len_align,
>> +                     pgoff, flags);
>> +     if (!IS_ERR_VALUE(addr_align)) {
>> +             addr_align += (off - addr_align) & (align - 1);
>> +             return addr_align;
>> +     }
>> + out:
>> +     return current->mm->get_unmapped_area(filp, addr, len, pgoff, flags);
>> +}
>> +
>> +static int __match_devt(struct device *dev, const void *data)
>> +{
>> +     const dev_t *devt = data;
>> +
>> +     return dev->devt == *devt;
>> +}
>> +
>> +static struct device *dax_dev_find(dev_t dev_t)
>> +{
>> +     return class_find_device(dax_class, NULL, &dev_t, __match_devt);
>> +}
>> +
>> +static int dax_dev_open(struct inode *inode, struct file *filp)
>> +{
>> +     struct dax_dev *dax_dev = NULL;
>> +     struct device *dev;
>> +
>> +     dev = dax_dev_find(inode->i_rdev);
>> +     if (!dev)
>> +             return -ENXIO;
>> +
>> +     device_lock(dev);
>> +     dax_dev = dev_get_drvdata(dev);
>> +     if (dax_dev) {
>> +             dev_dbg(dev, "%s\n", __func__);
>> +             filp->private_data = dax_dev;
>> +             kref_get(&dax_dev->kref);
>> +             inode->i_flags = S_DAX;
>> +     }
>> +     device_unlock(dev);
>> +
>
> Does this block really need to be protected by the dev mutex? If yes, have you
> considered re-ordering it like this?

We need to make sure the dax_dev is not freed while we're trying to
take a reference against it, hence the lock.

>
>         device_lock(dev);
>         dax_dev = dev_get_drvdata(dev);
>         if (!dax_dev) {
>                 device_unlock(dev);
>                 goto out_put;
>         }
>         filp->private_data = dax_dev;
>         kref_get(&dax_dev->kref); // or get_dax_device(dax_dev)

dax_dev may already be invalid at this point if open() is racing
device_unregister().

>         inode->i_flags = S_DAX;
>         device_unlock(dev);
>
>         return 0;
>
> out_put:
>         put_device(dev);
>         return -ENXIO;
>
> The only thing I see that could be needed to be protected here, is the
> inode->i_flags and shouldn't that be protected by the inode->i_mutex? But I'm
> not sure, hence the question.

Nothing else should be mutating the inode flags so I don't think we
need protection.

> Also S_DAX is the only valid flag for a DAX
> device, isn't it?

Correct, we need that capability so that vma_is_dax() will recognize
these mappings.

>
>> +     if (!dax_dev) {
>> +             put_device(dev);
>> +             return -ENXIO;
>> +     }
>> +     return 0;
>> +}
>> +
>> +static int dax_dev_release(struct inode *inode, struct file *filp)
>> +{
>> +     struct dax_dev *dax_dev = filp->private_data;
>> +     struct device *dev = dax_dev->dev;
>> +
>> +     dev_dbg(dax_dev->dev, "%s\n", __func__);
>> +     dax_dev_put(dax_dev);
>> +     put_device(dev);
>> +
>
> For reasons of consistency one could reset the inode's i_flags here.

That change seems like a bug to me, it doesn't stop being a dax inode
just because an open file is released, right?

>> +     return 0;
>> +}
>> +
>> +static int check_vma(struct dax_dev *dax_dev, struct vm_area_struct *vma,
>> +             const char *func)
>> +{
>> +     struct dax_region *dax_region = dax_dev->region;
>> +     struct device *dev = dax_dev->dev;
>> +     unsigned long mask;
>> +
>> +     if (!dax_dev->alive)
>> +             return -ENXIO;
>> +
>> +     /* prevent private / writable mappings from being established */
>> +     if ((vma->vm_flags & (VM_NORESERVE|VM_SHARED|VM_WRITE)) == VM_WRITE) {
>> +             dev_dbg(dev, "%s: %s: fail, attempted private mapping\n",
>> +                             current->comm, func);
>
> This deserves a higher log-level than debug, IMHO.
>
>> +             return -EINVAL;
>> +     }
>> +
>> +     mask = dax_region->align - 1;
>> +     if (vma->vm_start & mask || vma->vm_end & mask) {
>> +             dev_dbg(dev, "%s: %s: fail, unaligned vma (%#lx - %#lx, %#lx)\n",
>> +                             current->comm, func, vma->vm_start, vma->vm_end,
>> +                             mask);
>
> Ditto.
>
>> +             return -EINVAL;
>> +     }
>> +
>> +     if ((dax_region->pfn_flags & (PFN_DEV|PFN_MAP)) == PFN_DEV
>> +                     && (vma->vm_flags & VM_DONTCOPY) == 0) {
>> +             dev_dbg(dev, "%s: %s: fail, dax range requires MADV_DONTFORK\n",
>> +                             current->comm, func);
>
> Ditto.
>
>> +             return -EINVAL;
>> +     }
>> +
>> +     if (!vma_is_dax(vma)) {
>> +             dev_dbg(dev, "%s: %s: fail, vma is not DAX capable\n",
>> +                             current->comm, func);
>
> Ditto.

Ok, will bump these up to dev_info.

>> +             return -EINVAL;
>> +     }
>> +
>> +     return 0;
>> +}
>> +
>> +static phys_addr_t pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff,
>> +             unsigned long size)
>> +{
>> +     struct resource *res;
>> +     phys_addr_t phys;
>> +     int i;
>> +
>> +     for (i = 0; i < dax_dev->num_resources; i++) {
>> +             res = &dax_dev->res[i];
>> +             phys = pgoff * PAGE_SIZE + res->start;
>> +             if (phys >= res->start && phys <= res->end)
>> +                     break;
>> +             pgoff -= PHYS_PFN(resource_size(res));
>> +     }
>> +
>> +     if (i < dax_dev->num_resources) {
>> +             res = &dax_dev->res[i];
>> +             if (phys + size - 1 <= res->end)
>> +                     return phys;
>> +     }
>> +
>> +     return -1;
>> +}
>> +
>> +static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma,
>> +             struct vm_fault *vmf)
>> +{
>> +     unsigned long vaddr = (unsigned long) vmf->virtual_address;
>> +     struct device *dev = dax_dev->dev;
>> +     struct dax_region *dax_region;
>> +     int rc = VM_FAULT_SIGBUS;
>> +     phys_addr_t phys;
>> +     pfn_t pfn;
>> +
>> +     if (check_vma(dax_dev, vma, __func__))
>> +             return VM_FAULT_SIGBUS;
>> +
>> +     dax_region = dax_dev->region;
>> +     if (dax_region->align > PAGE_SIZE) {
>> +             dev_dbg(dev, "%s: alignment > fault size\n", __func__);
>> +             return VM_FAULT_SIGBUS;
>> +     }
>> +
>> +     phys = pgoff_to_phys(dax_dev, vmf->pgoff, PAGE_SIZE);
>> +     if (phys == -1) {
>> +             dev_dbg(dev, "%s: phys_to_pgoff(%#lx) failed\n", __func__,
>> +                             vmf->pgoff);
>> +             return VM_FAULT_SIGBUS;
>> +     }
>> +
>> +     pfn = phys_to_pfn_t(phys, dax_region->pfn_flags);
>> +
>> +     rc = vm_insert_mixed(vma, vaddr, pfn);
>> +
>> +     if (rc == -ENOMEM)
>> +             return VM_FAULT_OOM;
>> +     if (rc < 0 && rc != -EBUSY)
>> +             return VM_FAULT_SIGBUS;
>> +
>> +     return VM_FAULT_NOPAGE;
>> +}
>> +
>> +static int dax_dev_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
>> +{
>> +     int rc;
>> +     struct file *filp = vma->vm_file;
>> +     struct dax_dev *dax_dev = filp->private_data;
>> +
>> +     dev_dbg(dax_dev->dev, "%s: %s: %s (%#lx - %#lx)\n", __func__,
>> +                     current->comm, (vmf->flags & FAULT_FLAG_WRITE)
>> +                     ? "write" : "read", vma->vm_start, vma->vm_end);
>> +     rcu_read_lock();
>> +     rc = __dax_dev_fault(dax_dev, vma, vmf);
>> +     rcu_read_unlock();
>
> Similarly, what are you protecting? I just see you're locking something to be
> read, but don't do a rcu_dereference() to actually access a rcu protected
> pointer. Or am I missing something totally here?

Like I mentioned above, I'm protecting the fault handler vs shutdown.
I need this building-block-guarantee later when I go to fix the vfs to
unmap inodes on device-removal.  Same bug currently in filesystem-DAX
[1].

[1]: http://thread.gmane.org/gmane.linux.file-systems/103333/focus=72179

[..]

Thanks for the review Johannes.
--
To unsubscribe from this list: send the line "unsubscribe linux-block" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html