Re: [Patch 4/4] Support for obtaining reduced view of a graded file

Randy Dunlap <rdunlap@xxxxxxxxxxxxx> · Fri, 6 Apr 2018 10:34:19 -0700

On 04/06/2018 04:42 AM, Sayan Ghosh wrote:
> The patch is on top of Linux Kernel 4.7.2.
> 
> Signed-off-by: Sayan Ghosh <sgdgp.2014@xxxxxxxxx>
> ---
>  fs/dax.c       | 139 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  fs/ext4/ext4.h |   1 +
>  fs/ext4/file.c |  79 +++++++++++++++++++++++++-------
>  3 files changed, 203 insertions(+), 16 deletions(-)
> 
> diff --git a/fs/dax.c b/fs/dax.c
> index e207f8f..1930307 100755
> --- a/fs/dax.c
> +++ b/fs/dax.c
> @@ -793,6 +793,41 @@ int dax_writeback_mapping_range(struct address_space *mapping,
>  }
>  EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
> 
> +/*
> + * This function is a copy of dax_insert_mapping.
> + * It is called in skip_dax_fault_handler.
> + */
> +static int skip_dax_insert_mapping(struct address_space *mapping,
> +            struct buffer_head *bh, void **entryp,
> +            struct vm_area_struct *vma, struct vm_fault *vmf, sector_t blknum)
> +{
> +    unsigned long vaddr = (unsigned long)vmf->virtual_address;
> +    struct inode *inode = mapping->host;
> +    struct block_device *bdev = bh->b_bdev;
> +    bdev->bd_inode->i_ino=mapping->host->i_ino;
> +    struct blk_dax_ctl dax = {
> +        .sector = to_sector(bh, mapping->host),
> +        .size = bh->b_size,
> +    };
> +    int error;
> +    sector_t block;
> +    void *ret;
> +    void *entry = *entryp;
> +    block = (sector_t)vmf->pgoff << (PAGE_SHIFT - mapping->host->i_blkbits);
> +    dax.sector = blknum << (mapping->host->i_blkbits - 9);
> +    if (dax_map_atomic(bdev, &dax) < 0){
> +        return PTR_ERR(dax.addr);
> +    }

The indentation size is wrong.

Use tabs instead of spaces for indentation, per the kernel coding style.

> +    dax_unmap_atomic(bdev, &dax);
> +    ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector);
> +    if (IS_ERR(ret)){
> +        return PTR_ERR(ret);
> +    }
> +    *entryp = ret;
> +
> +    vm_insert_mixed(vma, vaddr, dax.pfn);
> +}
> +
>  static int dax_insert_mapping(struct address_space *mapping,
>              struct buffer_head *bh, void **entryp,
>              struct vm_area_struct *vma, struct vm_fault *vmf)
> @@ -915,6 +950,110 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
>  }
>  EXPORT_SYMBOL(__dax_fault);
> 
> +/*
> + * This is the modified __dax_fault handler.
> + * Most of the code is copied from __dax_fault function.
> + * One more parameter is passed here, namely skip_dax.
> + */
> +int __skip_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf,
> +            get_block_t get_block,long skip_dax)
> +{
> +    struct file *file = vma->vm_file;
> +    struct address_space *mapping = file->f_mapping;
> +    struct inode *inode = mapping->host;
> +    void *entry;
> +    struct buffer_head bh;
> +    unsigned long vaddr = (unsigned long)vmf->virtual_address;
> +    unsigned blkbits = inode->i_blkbits;
> +    sector_t block;
> +    sector_t corrected_sector,corrected_new_block;
> +    pgoff_t size;
> +    int error;
> +    int new_error;
> +    int major = 0;
> +
> +    /*
> +     * Check whether offset isn't beyond end of file now. Caller is supposed
> +     * to hold locks serializing us with truncate / punch hole so this is
> +     * a reliable test.
> +     */
> +    size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
> +    if (vmf->pgoff >= size)
> +        return VM_FAULT_SIGBUS;
> +
> +    memset(&bh, 0, sizeof(bh));
> +    block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits);
> +    bh.b_bdev = inode->i_sb->s_bdev;
> +    bh.b_size = PAGE_SIZE;
> +
> +    entry = grab_mapping_entry(mapping, vmf->pgoff);
> +    if (IS_ERR(entry)) {
> +        error = PTR_ERR(entry);
> +        goto out;
> +    }
> +
> +    error = get_block(inode, block, &bh, 0);
> +    if (!error && (bh.b_size < PAGE_SIZE))
> +        error = -EIO;        /* fs corruption? */
> +    if (error){
> +        goto unlock_entry;
> +    }
> +
> +    if (vmf->cow_page) {
> +        struct page *new_page = vmf->cow_page;
> +        if (buffer_written(&bh))
> +            error = copy_user_bh(new_page, inode, &bh, vaddr);
> +        else
> +            clear_user_highpage(new_page, vaddr);
> +        if (error){
> +            goto unlock_entry;
> +        }
> +        if (!radix_tree_exceptional_entry(entry)) {
> +            vmf->page = entry;
> +            return VM_FAULT_LOCKED;
> +        }
> +        vmf->entry = entry;
> +        return VM_FAULT_DAX_LOCKED;
> +    }
> +
> +    if (!buffer_mapped(&bh)) {
> +        if (vmf->flags & FAULT_FLAG_WRITE) {
> +            error = get_block(inode, block, &bh, 1);
> +            count_vm_event(PGMAJFAULT);
> +            mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
> +            major = VM_FAULT_MAJOR;
> +            if (!error && (bh.b_size < PAGE_SIZE))
> +                error = -EIO;
> +            if (error)
> +                goto unlock_entry;
> +        } else {
> +            goto out2;
> +        }
> +    }
> +
> +    /* Filesystem should not return unwritten buffers to us! */
> +    WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh));
> +out2:
> +    /* We take the new block here, the next higher
> +     * graded block
> +     */
> +    corrected_sector = skip_dax;
> +    new_error = get_block(inode, corrected_sector, &bh, 0);
> +    corrected_new_block = bh.b_blocknr;
> +    error = get_block(inode, block, &bh, 0);
> +    error = skip_dax_insert_mapping(mapping, &bh, &entry, vma, vmf, corrected_new_block);
> + unlock_entry:
> +    put_locked_mapping_entry(mapping, vmf->pgoff, entry);
> + out:
> +    if (error == -ENOMEM)
> +        return VM_FAULT_OOM | major;
> +    /* -EBUSY is fine, somebody else faulted on the same PTE */
> +    if ((error < 0) && (error != -EBUSY))
> +        return VM_FAULT_SIGBUS | major;
> +    return VM_FAULT_NOPAGE | major;
> +}
> +EXPORT_SYMBOL(__skip_dax_fault);
> +
>  /**
>   * dax_fault - handle a page fault on a DAX file
>   * @vma: The virtual memory area where the fault occurred
> diff --git a/fs/ext4/file.c b/fs/ext4/file.c
> index 368cf53..5dafd52 100755
> --- a/fs/ext4/file.c
> +++ b/fs/ext4/file.c
> @@ -32,6 +32,20 @@
>  #include "acl.h"
> 
>  /*
> + * read_high() returns 0 or 1 depending whether we want to read all the file
> + * blocks or only high graded, respectively.
> + * It gets this information from the extended attribute set by user beforehand.
> + */
> +int read_high(struct inode *inode)
> +{
> +    const char *xattr_name = "read_high";
> +    int read_high = 0;
> +    int xattr_size = sizeof(int);
> +    xattr_size = ext4_xattr_get(inode, EXT4_XATTR_INDEX_USER,xattr_name, (void *)&read_high,xattr_size);

This line is too long; keep lines within 80 columns.

> +    return read_high;
> +}
> +
> +/*
>   * Called when an inode is released. Note that this is different
>   * from ext4_file_open: open gets called at every open, but release
>   * gets called only when /all/ the files are closed.
> @@ -349,22 +363,55 @@ static int graded_ext4_fault(struct vm_area_struct *vma, struct vm_fault *vmf){
>              }
>          }
>          else{
> -            /*
> -             * Here the higher graded blocks are redirected via DAX path
> -             * since we consider Persistent Memory as higher tier.
> -             *
> -             * ** TODO **
> -             * To take care of the case when the higher tier is not
> -             * persistent memory (can be HDD-SSD combination), a check
> -             * of the same needs to be provided before re-direction.
> -             */
> -            unsigned long long temp;
> -            if(find_grade(grade_array,total,block,&temp) == 1){
> -                result = __dax_fault(vma, vmf, ext4_dax_get_block);
> -            }
> -            else if(find_grade(grade_array,total,block,&temp) == 0){
> -                result = ext4_filemap_fault(vma,vmf);
> -            }
> +            /*
> +             * If read_high is enabled then read the higher
> +             * grade blocks only.
> +             * It uses a modified dax_fault handler with
> +             * the assumption that high grade blocks are
> +             * in Persistent Memory.
> +             *
> +             * ** TODO 1**
> +             * To take care when high grade blocks are allocated elsewhere.
> +             * Checking of allocated space of each high graded block needs
> +             * to be done.
> +             *
> +             * ** TODO 2**
> +             * Modifying vmf according to the target_block in order to
> +             * use the existing dax_fault handler needs to be done.
> +             */
> +            if(read_high(inode) == 1)
> +            {

		if (read_high(inode) == 1) {

> +                ext4_lblk_t target_block;
> +                if(block >= total)

		if (block >= total) {

> +                {
> +                    goto out;
> +                }
> +                else{
> +                    target_block = block;
> +                    goto pm_fault_handler;
> +                }
> +            pm_fault_handler:
> +                result = __skip_dax_fault(vma, vmf, ext4_dax_get_block,target_block);
> +            }
> +            else
> +            {
> +                /*
> +                 * Here the higher graded blocks are redirected via DAX path
> +                 * since we consider Persistent Memory as higher tier.
> +                 *
> +                 * ** TODO **
> +                 * To take care of the case when the higher tier is not
> +                 * persistent memory (can be HDD-SSD combination), a check
> +                 * of the same needs to be provided before re-direction.
> +                 */
> +                unsigned long long temp;
> +                if(find_grade(grade_array,total,block,&temp) == 1){
> +                    result = __dax_fault(vma, vmf, ext4_dax_get_block);
> +                }
> +                else if(find_grade(grade_array,total,block,&temp) == 0){
> +                    result = ext4_filemap_fault(vma,vmf);
> +                }
> +            }
>          }
>      }
>   out:
> ‌
> 

-- 
~Randy