From: Yulei Zhang <yuleixzhang@xxxxxxxxxxx> Add mmap support. Note that the file will be extended if the mmap offset lies beyond its end, which removes the need for a write() operation; however, shrinking the file is not supported yet. Signed-off-by: Xiao Guangrong <gloryxiao@xxxxxxxxxxx> Signed-off-by: Yulei Zhang <yuleixzhang@xxxxxxxxxxx> --- fs/dmemfs/inode.c | 337 ++++++++++++++++++++++++++++++++++++++++++- include/linux/dmem.h | 10 ++ 2 files changed, 345 insertions(+), 2 deletions(-) diff --git a/fs/dmemfs/inode.c b/fs/dmemfs/inode.c index 6a8a2d9f94e9..21d2f951b4ea 100644 --- a/fs/dmemfs/inode.c +++ b/fs/dmemfs/inode.c @@ -26,6 +26,7 @@ #include <linux/pagevec.h> #include <linux/fs_parser.h> #include <linux/seq_file.h> +#include <linux/dmem.h> MODULE_AUTHOR("Tencent Corporation"); MODULE_LICENSE("GPL v2"); @@ -105,8 +106,250 @@ static const struct inode_operations dmemfs_file_inode_operations = { .getattr = simple_getattr, }; +static unsigned long dmem_pgoff_to_index(struct inode *inode, pgoff_t pgoff) +{ + struct super_block *sb = inode->i_sb; + + return pgoff >> (sb->s_blocksize_bits - PAGE_SHIFT); +} + +static void *dmem_addr_to_entry(struct inode *inode, phys_addr_t addr) +{ + struct super_block *sb = inode->i_sb; + + addr >>= sb->s_blocksize_bits; + return xa_mk_value(addr); +} + +static phys_addr_t dmem_entry_to_addr(struct inode *inode, void *entry) +{ + struct super_block *sb = inode->i_sb; + + WARN_ON(!xa_is_value(entry)); + return xa_to_value(entry) << sb->s_blocksize_bits; +} + +static unsigned long +dmem_addr_to_pfn(struct inode *inode, phys_addr_t addr, pgoff_t pgoff, + unsigned int fault_shift) +{ + struct super_block *sb = inode->i_sb; + unsigned long pfn = addr >> PAGE_SHIFT; + unsigned long mask; + + mask = (1UL << ((unsigned int)sb->s_blocksize_bits - fault_shift)) - 1; + mask <<= fault_shift - PAGE_SHIFT; + + return pfn + (pgoff & mask); +} + +static inline unsigned long dmem_page_size(struct inode *inode) +{ + return inode->i_sb->s_blocksize; +} + 
+static int check_inode_size(struct inode *inode, loff_t offset) +{ + WARN_ON_ONCE(!rcu_read_lock_held()); + + if (offset >= i_size_read(inode)) + return -EINVAL; + + return 0; +} + +static unsigned +dmemfs_find_get_entries(struct address_space *mapping, unsigned long start, + unsigned int nr_entries, void **entries, + unsigned long *indices) +{ + XA_STATE(xas, &mapping->i_pages, start); + + void *entry; + unsigned int ret = 0; + + if (!nr_entries) + return 0; + + rcu_read_lock(); + + xas_for_each(&xas, entry, ULONG_MAX) { + if (xas_retry(&xas, entry)) + continue; + + if (xa_is_value(entry)) + goto export; + + if (unlikely(entry != xas_reload(&xas))) + goto retry; + +export: + indices[ret] = xas.xa_index; + entries[ret] = entry; + if (++ret == nr_entries) + break; + continue; +retry: + xas_reset(&xas); + } + rcu_read_unlock(); + return ret; +} + +static void *find_radix_entry_or_next(struct address_space *mapping, + unsigned long start, + unsigned long *eindex) +{ + void *entry = NULL; + + dmemfs_find_get_entries(mapping, start, 1, &entry, eindex); + return entry; +} + +/* + * find the entry in radix tree based on @index, create it if + * it does not exist + * + * return the entry with rcu locked, otherwise ERR_PTR() + * is returned + */ +static void * +radix_get_create_entry(struct vm_area_struct *vma, unsigned long fault_addr, + struct inode *inode, pgoff_t pgoff) +{ + struct address_space *mapping = inode->i_mapping; + unsigned long eindex, index; + loff_t offset; + phys_addr_t addr; + gfp_t gfp_masks = mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM; + void *entry; + unsigned int try_dpages, dpages; + int ret; + +retry: + offset = ((loff_t)pgoff << PAGE_SHIFT); + index = dmem_pgoff_to_index(inode, pgoff); + rcu_read_lock(); + ret = check_inode_size(inode, offset); + if (ret) { + rcu_read_unlock(); + return ERR_PTR(ret); + } + + try_dpages = dmem_pgoff_to_index(inode, (i_size_read(inode) - offset) + >> PAGE_SHIFT); + entry = find_radix_entry_or_next(mapping, index, 
&eindex); + if (entry) { + WARN_ON(!xa_is_value(entry)); + if (eindex == index) + return entry; + + WARN_ON(eindex <= index); + try_dpages = eindex - index; + } + rcu_read_unlock(); + + /* entry does not exist, create it */ + addr = dmem_alloc_pages_vma(vma, fault_addr, try_dpages, &dpages); + if (!addr) { + /* + * do not return -ENOMEM as that will trigger OOM, + * it is useless for reclaiming dmem page + */ + ret = -EINVAL; + goto exit; + } + + try_dpages = dpages; + while (dpages) { + rcu_read_lock(); + ret = check_inode_size(inode, offset); + if (ret) + goto unlock_rcu; + + entry = dmem_addr_to_entry(inode, addr); + entry = xa_store(&mapping->i_pages, index, entry, gfp_masks); + if (!xa_is_err(entry)) { + addr += inode->i_sb->s_blocksize; + offset += inode->i_sb->s_blocksize; + dpages--; + mapping->nrexceptional++; + index++; + } + +unlock_rcu: + rcu_read_unlock(); + if (ret) + break; + } + + if (dpages) + dmem_free_pages(addr, dpages); + + /* we have created some entries, let's retry it */ + if (ret == -EEXIST || try_dpages != dpages) + goto retry; +exit: + return ERR_PTR(ret); +} + +static void radix_put_entry(void) +{ + rcu_read_unlock(); +} + +static vm_fault_t dmemfs_fault(struct vm_fault *vmf) +{ + struct vm_area_struct *vma = vmf->vma; + struct inode *inode = file_inode(vma->vm_file); + phys_addr_t addr; + void *entry; + int ret; + + if (vmf->pgoff > (MAX_LFS_FILESIZE >> PAGE_SHIFT)) + return VM_FAULT_SIGBUS; + + entry = radix_get_create_entry(vma, (unsigned long)vmf->address, + inode, vmf->pgoff); + if (IS_ERR(entry)) { + ret = PTR_ERR(entry); + goto exit; + } + + addr = dmem_entry_to_addr(inode, entry); + ret = vmf_insert_pfn(vma, (unsigned long)vmf->address, + dmem_addr_to_pfn(inode, addr, vmf->pgoff, + PAGE_SHIFT)); + radix_put_entry(); + +exit: + return ret; +} + +static unsigned long dmemfs_pagesize(struct vm_area_struct *vma) +{ + return dmem_page_size(file_inode(vma->vm_file)); +} + +static const struct vm_operations_struct dmemfs_vm_ops = { + 
.fault = dmemfs_fault, + .pagesize = dmemfs_pagesize, +}; + int dmemfs_file_mmap(struct file *file, struct vm_area_struct *vma) { + struct inode *inode = file_inode(file); + + if (vma->vm_pgoff & ((dmem_page_size(inode) - 1) >> PAGE_SHIFT)) + return -EINVAL; + + if (!(vma->vm_flags & VM_SHARED)) + return -EINVAL; + + vma->vm_flags |= VM_PFNMAP; + + file_accessed(file); + vma->vm_ops = &dmemfs_vm_ops; return 0; } @@ -189,9 +432,86 @@ static int dmemfs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } +/* + * should make sure the dmem page in the dropped region is not + * being mapped by any process + */ +static void inode_drop_dpages(struct inode *inode, loff_t start, loff_t end) +{ + struct address_space *mapping = inode->i_mapping; + struct pagevec pvec; + unsigned long istart, iend, indices[PAGEVEC_SIZE]; + int i; + + /* we never use normal pages */ + WARN_ON(mapping->nrpages); + + /* if no dpage is allocated for the inode */ + if (!mapping->nrexceptional) + return; + + istart = dmem_pgoff_to_index(inode, start >> PAGE_SHIFT); + iend = dmem_pgoff_to_index(inode, end >> PAGE_SHIFT); + pagevec_init(&pvec); + while (istart < iend) { + pvec.nr = dmemfs_find_get_entries(mapping, istart, + min(iend - istart, + (unsigned long)PAGEVEC_SIZE), + (void **)pvec.pages, + indices); + if (!pvec.nr) + break; + + for (i = 0; i < pagevec_count(&pvec); i++) { + phys_addr_t addr; + + istart = indices[i]; + if (istart >= iend) + break; + + xa_erase(&mapping->i_pages, istart); + mapping->nrexceptional--; + + addr = dmem_entry_to_addr(inode, pvec.pages[i]); + dmem_free_page(addr); + } + + /* + * only exception entries in pagevec, it's safe to + * reinit it + */ + pagevec_reinit(&pvec); + cond_resched(); + istart++; + } +} + +static void dmemfs_evict_inode(struct inode *inode) +{ + /* no VMA works on it */ + WARN_ON(!RB_EMPTY_ROOT(&inode->i_data.i_mmap.rb_root)); + + inode_drop_dpages(inode, 0, LLONG_MAX); + clear_inode(inode); +} + +/* + * Display the mount options in 
/proc/mounts. + */ +static int dmemfs_show_options(struct seq_file *m, struct dentry *root) +{ + struct dmemfs_fs_info *fsi = root->d_sb->s_fs_info; + + if (check_dpage_size(fsi->mount_opts.dpage_size)) + seq_printf(m, ",pagesize=%lx", fsi->mount_opts.dpage_size); + return 0; +} + static const struct super_operations dmemfs_ops = { .statfs = dmemfs_statfs, + .evict_inode = dmemfs_evict_inode, .drop_inode = generic_delete_inode, + .show_options = dmemfs_show_options, }; static int @@ -199,6 +519,7 @@ dmemfs_fill_super(struct super_block *sb, struct fs_context *fc) { struct inode *inode; struct dmemfs_fs_info *fsi = sb->s_fs_info; + int ret; sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_blocksize = fsi->mount_opts.dpage_size; @@ -207,11 +528,17 @@ dmemfs_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_op = &dmemfs_ops; sb->s_time_gran = 1; + ret = dmem_alloc_init(sb->s_blocksize_bits); + if (ret) + return ret; + inode = dmemfs_get_inode(sb, NULL, S_IFDIR, 0); sb->s_root = d_make_root(inode); - if (!sb->s_root) - return -ENOMEM; + if (!sb->s_root) { + dmem_alloc_uinit(); + return -ENOMEM; + } return 0; } @@ -247,7 +574,13 @@ int dmemfs_init_fs_context(struct fs_context *fc) static void dmemfs_kill_sb(struct super_block *sb) { + bool has_inode = !!sb->s_root; + kill_litter_super(sb); + + /* do not uninit dmem allocator if mount failed */ + if (has_inode) + dmem_alloc_uinit(); } static struct file_system_type dmemfs_fs_type = { diff --git a/include/linux/dmem.h b/include/linux/dmem.h index 476a82e8f252..8682d63ed43a 100644 --- a/include/linux/dmem.h +++ b/include/linux/dmem.h @@ -10,6 +10,16 @@ int dmem_region_register(int node, phys_addr_t start, phys_addr_t end); int dmem_alloc_init(unsigned long dpage_shift); void dmem_alloc_uinit(void); +phys_addr_t +dmem_alloc_pages_nodemask(int nid, nodemask_t *nodemask, unsigned int try_max, + unsigned int *result_nr); + +phys_addr_t +dmem_alloc_pages_vma(struct vm_area_struct *vma, unsigned long addr, + unsigned int 
try_max, unsigned int *result_nr); + +void dmem_free_pages(phys_addr_t addr, unsigned int dpages_nr); +#define dmem_free_page(addr) dmem_free_pages(addr, 1) #else static inline int dmem_reserve_init(void) { -- 2.28.0