On 04/06/2018 04:42 AM, Sayan Ghosh wrote:
> The patch is on top of Linux Kernel 4.7.2. > > Signed-off-by: Sayan Ghosh <sgdgp.2014@xxxxxxxxx> > --- > fs/dax.c | 139 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ > fs/ext4/ext4.h | 1 + > fs/ext4/file.c | 79 +++++++++++++++++++++++++------- > 3 files changed, 203 insertions(+), 16 deletions(-) > > diff --git a/fs/dax.c b/fs/dax.c > index e207f8f..1930307 100755 > --- a/fs/dax.c > +++ b/fs/dax.c > @@ -793,6 +793,41 @@ int dax_writeback_mapping_range(struct > address_space *mapping, > } > EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); > > +/* > + * This function is a copy of dax_insert_mapping. > + * It is called in skip_dax_fault_handler. > + */ > +static int skip_dax_insert_mapping(struct address_space *mapping, > + struct buffer_head *bh, void **entryp, > + struct vm_area_struct *vma, struct vm_fault *vmf, sector_t blknum) > +{ > + unsigned long vaddr = (unsigned long)vmf->virtual_address; > + struct inode *inode = mapping->host; > + struct block_device *bdev = bh->b_bdev; > + bdev->bd_inode->i_ino=mapping->host->i_ino; > + struct blk_dax_ctl dax = { > + .sector = to_sector(bh, mapping->host), > + .size = bh->b_size, > + }; > + int error; > + sector_t block; > + void *ret; > + void *entry = *entryp; > + block = (sector_t)vmf->pgoff << (PAGE_SHIFT - mapping->host->i_blkbits); > + dax.sector = blknum << (mapping->host->i_blkbits - 9); > + if (dax_map_atomic(bdev, &dax) < 0){ > + return PTR_ERR(dax.addr); > + }

The indentation size is wrong here. Use tabs instead of spaces.

> + dax_unmap_atomic(bdev, &dax); > + ret = dax_insert_mapping_entry(mapping, vmf, entry, dax.sector); > + if (IS_ERR(ret)){ > + return PTR_ERR(ret); > + } > + *entryp = ret; > + > + vm_insert_mixed(vma, vaddr, dax.pfn); > +} > + > static int dax_insert_mapping(struct address_space *mapping, > struct buffer_head *bh, void **entryp, > struct vm_area_struct *vma, struct vm_fault *vmf) > @@ -915,6 +950,110 @@ int __dax_fault(struct vm_area_struct *vma, > struct vm_fault *vmf, > } > EXPORT_SYMBOL(__dax_fault); > > +/* > + * This is the modified __dax_fault handler. > + * Most of the code is copied from __dax_fault function. > + * One more parameter is passed here, namely skip_dax. > + */ > +int __skip_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, > + get_block_t get_block,long skip_dax) > +{ > + struct file *file = vma->vm_file; > + struct address_space *mapping = file->f_mapping; > + struct inode *inode = mapping->host; > + void *entry; > + struct buffer_head bh; > + unsigned long vaddr = (unsigned long)vmf->virtual_address; > + unsigned blkbits = inode->i_blkbits; > + sector_t block; > + sector_t corrected_sector,corrected_new_block; > + pgoff_t size; > + int error; > + int new_error; > + int major = 0; > + > + /* > + * Check whether offset isn't beyond end of file now. Caller is supposed > + * to hold locks serializing us with truncate / punch hole so this is > + * a reliable test. > + */ > + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; > + if (vmf->pgoff >= size) > + return VM_FAULT_SIGBUS; > + > + memset(&bh, 0, sizeof(bh)); > + block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits); > + bh.b_bdev = inode->i_sb->s_bdev; > + bh.b_size = PAGE_SIZE; > + > + entry = grab_mapping_entry(mapping, vmf->pgoff); > + if (IS_ERR(entry)) { > + error = PTR_ERR(entry); > + goto out; > + } > + > + error = get_block(inode, block, &bh, 0); > + if (!error && (bh.b_size < PAGE_SIZE)) > + error = -EIO; /* fs corruption? 
*/ > + if (error){ > + goto unlock_entry; > + } > + > + if (vmf->cow_page) { > + struct page *new_page = vmf->cow_page; > + if (buffer_written(&bh)) > + error = copy_user_bh(new_page, inode, &bh, vaddr); > + else > + clear_user_highpage(new_page, vaddr); > + if (error){ > + goto unlock_entry; > + } > + if (!radix_tree_exceptional_entry(entry)) { > + vmf->page = entry; > + return VM_FAULT_LOCKED; > + } > + vmf->entry = entry; > + return VM_FAULT_DAX_LOCKED; > + } > + > + if (!buffer_mapped(&bh)) { > + if (vmf->flags & FAULT_FLAG_WRITE) { > + error = get_block(inode, block, &bh, 1); > + count_vm_event(PGMAJFAULT); > + mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); > + major = VM_FAULT_MAJOR; > + if (!error && (bh.b_size < PAGE_SIZE)) > + error = -EIO; > + if (error) > + goto unlock_entry; > + } else { > + goto out2; > + } > + } > + > + /* Filesystem should not return unwritten buffers to us! */ > + WARN_ON_ONCE(buffer_unwritten(&bh) || buffer_new(&bh)); > +out2: > + /* We take the new block here, the next higher > + * graded block > + */ > + corrected_sector = skip_dax; > + new_error = get_block(inode, corrected_sector, &bh, 0); > + corrected_new_block = bh.b_blocknr; > + error = get_block(inode, block, &bh, 0); > + error = skip_dax_insert_mapping(mapping, &bh, &entry, vma, vmf, > corrected_new_block); > + unlock_entry: > + put_locked_mapping_entry(mapping, vmf->pgoff, entry); > + out: > + if (error == -ENOMEM) > + return VM_FAULT_OOM | major; > + /* -EBUSY is fine, somebody else faulted on the same PTE */ > + if ((error < 0) && (error != -EBUSY)) > + return VM_FAULT_SIGBUS | major; > + return VM_FAULT_NOPAGE | major; > +} > +EXPORT_SYMBOL(__skip_dax_fault); > + > /** > * dax_fault - handle a page fault on a DAX file > * @vma: The virtual memory area where the fault occurred > diff --git a/fs/ext4/file.c b/fs/ext4/file.c > index 368cf53..5dafd52 100755 > --- a/fs/ext4/file.c > +++ b/fs/ext4/file.c > @@ -32,6 +32,20 @@ > #include "acl.h" > > /* > + * read_high() 
returns 0 or 1 depending whether we want to read all the file > + * blocks or only high graded, respectively. > + * It gets this information from the extended attribute set by user beforehand. > + */ > +int read_high(struct inode *inode) > +{ > + const char *xattr_name = "read_high"; > + int read_high = 0; > + int xattr_size = sizeof(int); > + xattr_size = ext4_xattr_get(inode, > EXT4_XATTR_INDEX_USER,xattr_name, (void *)&read_high,xattr_size);

This line is too long; please wrap it to fit the kernel's line-length limit.

> + return read_high; > +} > + > +/* > * Called when an inode is released. Note that this is different > * from ext4_file_open: open gets called at every open, but release > * gets called only when /all/ the files are closed. > @@ -349,22 +363,55 @@ static int graded_ext4_fault(struct > vm_area_struct *vma, struct vm_fault *vmf){ > } > } > else{ > - /* > - * Here the higher graded blocks are redirected via DAX path > - * since we consider Persistent Memory as higher tier. > - * > - * ** TODO ** > - * To take care of the case when the higher tier is not > - * persistent memory (can be HDD-SSD combination), a check > - * of the same needs to be provided before re-direction. > - */ > - unsigned long long temp; > - if(find_grade(grade_array,total,block,&temp) == 1){ > - result = __dax_fault(vma, vmf, ext4_dax_get_block); > - } > - else if(find_grade(grade_array,total,block,&temp) == 0){ > - result = ext4_filemap_fault(vma,vmf); > - } > + /* > + * If read_high is enabled then read the higher > + * grade blocks only. > + * It uses a modified dax_fault handler with > + * the assumption that high grade blocks are > + * in Persistent Memory. > + * > + * ** TODO 1** > + * To take care when high grade blocks are allocated elsewhere. > + * Checking of allocated space of each high graded block needs > + * to be done. > + * > + * ** TODO 2** > + * Modifying vmf according to the target_block in order to > + * use the existing dax_fault handler needs to be done. 
> + */ > + if(read_high(inode) == 1) > + {

Kernel coding style: put a space after "if" and the opening brace on the same line as the condition:

    if (read_high(inode) == 1) {

> + ext4_lblk_t target_block; > + if(block >= total)

Same here:

    if (block >= total) {

> + { > + goto out; > + } > + else{ > + target_block = block; > + goto pm_fault_handler; > + } > + pm_fault_handler: > + result = __skip_dax_fault(vma, vmf, > ext4_dax_get_block,target_block); > + } > + else > + { > + /* > + * Here the higher graded blocks are redirected via DAX path > + * since we consider Persistent Memory as higher tier. > + * > + * ** TODO ** > + * To take care of the case when the higher tier is not > + * persistent memory (can be HDD-SSD combination), a check > + * of the same needs to be provided before re-direction. > + */ > + unsigned long long temp; > + if(find_grade(grade_array,total,block,&temp) == 1){ > + result = __dax_fault(vma, vmf, ext4_dax_get_block); > + } > + else if(find_grade(grade_array,total,block,&temp) == 0){ > + result = ext4_filemap_fault(vma,vmf); > + } > + } > } > } > out: >

-- 
~Randy