Normally, when an mmapped file is accessed and takes a page fault, the file's
(->mapping, ->index) is associated with the dax entry (which represents one
page or a couple of pages) to facilitate the reverse mapping search.  But in
the case of reflink, a dax entry may be shared by multiple files or offsets.
In order to establish the reverse mapping relationship in this case, I
introduce an rb-tree to track the multiple files and offsets.  The root of
the rb-tree is stored in page->private, since I haven't found that field
being used by fsdax.  The rb-tree is created, and the (->mapping, ->index)
tuples are inserted, the second time a dax entry is associated, i.e. when
the dax entry becomes shared.  A tuple is deleted from the rb-tree when it
is disassociated.

Signed-off-by: Shiyang Ruan <ruansy.fnst@xxxxxxxxxxxxxx>
---
 fs/dax.c            | 153 ++++++++++++++++++++++++++++++++++++++++----
 include/linux/dax.h |   6 ++
 2 files changed, 147 insertions(+), 12 deletions(-)

diff --git a/fs/dax.c b/fs/dax.c
index 11b16729b86f..2f996c566103 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -25,6 +25,7 @@
 #include <linux/sizes.h>
 #include <linux/mmu_notifier.h>
 #include <linux/iomap.h>
+#include <linux/rbtree.h>
 #include <asm/pgalloc.h>
 
 #define CREATE_TRACE_POINTS
@@ -310,6 +311,120 @@ static unsigned long dax_entry_size(void *entry)
 		return PAGE_SIZE;
 }
 
+static struct kmem_cache *dax_rmap_node_cachep;
+static struct kmem_cache *dax_rmap_root_cachep;
+
+static int __init init_dax_rmap_cache(void)
+{
+	dax_rmap_root_cachep = KMEM_CACHE(rb_root_cached, SLAB_PANIC|SLAB_ACCOUNT);
+	dax_rmap_node_cachep = KMEM_CACHE(shared_file, SLAB_PANIC|SLAB_ACCOUNT);
+	return 0;
+}
+fs_initcall(init_dax_rmap_cache);
+
+struct rb_root_cached *dax_create_rbroot(void)
+{
+	struct rb_root_cached *root = kmem_cache_alloc(dax_rmap_root_cachep,
+						       GFP_KERNEL);
+
+	memset(root, 0, sizeof(struct rb_root_cached));
+	return root;
+}
+
+static bool dax_rmap_insert(struct page *page, struct address_space *mapping,
+			    pgoff_t index)
+{
+	struct rb_root_cached *root = (struct rb_root_cached *)page_private(page);
+	struct rb_node **new, *parent = NULL;
+	struct shared_file *p;
+	bool leftmost = true;
+
+	if (!root) {
+		root = dax_create_rbroot();
+		set_page_private(page, (unsigned long)root);
+		dax_rmap_insert(page, page->mapping, page->index);
+	}
+	new = &root->rb_root.rb_node;
+	/* Figure out where to insert new node */
+	while (*new) {
+		struct shared_file *this = container_of(*new, struct shared_file, node);
+		long result = (long)mapping - (long)this->mapping;
+
+		if (result == 0)
+			result = (long)index - (long)this->index;
+		parent = *new;
+		if (result < 0)
+			new = &((*new)->rb_left);
+		else if (result > 0) {
+			new = &((*new)->rb_right);
+			leftmost = false;
+		} else
+			return false;
+	}
+	p = kmem_cache_alloc(dax_rmap_node_cachep, GFP_KERNEL);
+	p->mapping = mapping;
+	p->index = index;
+
+	/* Add new node and rebalance tree. */
+	rb_link_node(&p->node, parent, new);
+	rb_insert_color_cached(&p->node, root, leftmost);
+
+	return true;
+}
+
+static struct shared_file *dax_rmap_search(struct page *page,
+					   struct address_space *mapping,
+					   pgoff_t index)
+{
+	struct rb_root_cached *root = (struct rb_root_cached *)page_private(page);
+	struct rb_node *node = root->rb_root.rb_node;
+
+	while (node) {
+		struct shared_file *this = container_of(node, struct shared_file, node);
+		long result = (long)mapping - (long)this->mapping;
+
+		if (result == 0)
+			result = (long)index - (long)this->index;
+		if (result < 0)
+			node = node->rb_left;
+		else if (result > 0)
+			node = node->rb_right;
+		else
+			return this;
+	}
+	return NULL;
+}
+
+static void dax_rmap_delete(struct page *page, struct address_space *mapping,
+			    pgoff_t index)
+{
+	struct rb_root_cached *root = (struct rb_root_cached *)page_private(page);
+	struct shared_file *this;
+
+	if (!root) {
+		page->mapping = NULL;
+		page->index = 0;
+		return;
+	}
+
+	this = dax_rmap_search(page, mapping, index);
+	rb_erase_cached(&this->node, root);
+	kmem_cache_free(dax_rmap_node_cachep, this);
+
+	if (!RB_EMPTY_ROOT(&root->rb_root)) {
+		if (page->mapping == mapping && page->index == index) {
+			this = container_of(rb_first_cached(root), struct shared_file, node);
+			page->mapping = this->mapping;
+			page->index = this->index;
+		}
+	} else {
+		kmem_cache_free(dax_rmap_root_cachep, root);
+		set_page_private(page, 0);
+		page->mapping = NULL;
+		page->index = 0;
+	}
+}
+
 static unsigned long dax_end_pfn(void *entry)
 {
 	return dax_to_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE;
@@ -341,16 +456,20 @@ static void dax_associate_entry(void *entry, struct address_space *mapping,
 	for_each_mapped_pfn(entry, pfn) {
 		struct page *page = pfn_to_page(pfn);
 
-		WARN_ON_ONCE(page->mapping);
-		page->mapping = mapping;
-		page->index = index + i++;
+		if (!page->mapping) {
+			page->mapping = mapping;
+			page->index = index + i++;
+		} else {
+			dax_rmap_insert(page, mapping, index + i++);
+		}
 	}
 }
 
 static void dax_disassociate_entry(void *entry, struct address_space *mapping,
-		bool trunc)
+		pgoff_t index, bool trunc)
 {
 	unsigned long pfn;
+	int i = 0;
 
 	if (IS_ENABLED(CONFIG_FS_DAX_LIMITED))
 		return;
@@ -359,9 +478,19 @@ static void dax_disassociate_entry(void *entry, struct address_space *mapping,
 		struct page *page = pfn_to_page(pfn);
 
 		WARN_ON_ONCE(trunc && page_ref_count(page) > 1);
-		WARN_ON_ONCE(page->mapping && page->mapping != mapping);
-		page->mapping = NULL;
-		page->index = 0;
+		WARN_ON_ONCE(!page->mapping);
+		dax_rmap_delete(page, mapping, index + i++);
+	}
+}
+
+static void __dax_decrease_nrexceptional(void *entry,
+		struct address_space *mapping)
+{
+	if (dax_is_empty_entry(entry) || dax_is_zero_entry(entry) ||
+	    dax_is_pmd_entry(entry)) {
+		mapping->nrexceptional--;
+	} else {
+		mapping->nrexceptional -= PHYS_PFN(dax_entry_size(entry));
 	}
 }
 
@@ -522,10 +651,10 @@ static void *grab_mapping_entry(struct xa_state *xas,
 				xas_lock_irq(xas);
 			}
 
-			dax_disassociate_entry(entry, mapping, false);
+			dax_disassociate_entry(entry, mapping, index, false);
 			xas_store(xas, NULL);	/* undo the PMD join */
 			dax_wake_entry(xas, entry, true);
-			mapping->nrexceptional--;
+			__dax_decrease_nrexceptional(entry, mapping);
 			entry = NULL;
 			xas_set(xas, index);
 		}
@@ -642,9 +771,9 @@ static int __dax_invalidate_entry(struct address_space *mapping,
 	    (xas_get_mark(&xas, PAGECACHE_TAG_DIRTY) ||
 	     xas_get_mark(&xas, PAGECACHE_TAG_TOWRITE)))
 		goto out;
-	dax_disassociate_entry(entry, mapping, trunc);
+	dax_disassociate_entry(entry, mapping, index, trunc);
 	xas_store(&xas, NULL);
-	mapping->nrexceptional--;
+	__dax_decrease_nrexceptional(entry, mapping);
 	ret = 1;
 out:
 	put_unlocked_entry(&xas, entry);
@@ -737,7 +866,7 @@ static void *dax_insert_entry(struct xa_state *xas,
 	if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) {
 		void *old;
 
-		dax_disassociate_entry(entry, mapping, false);
+		dax_disassociate_entry(entry, mapping, xas->xa_index, false);
 		dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address);
 		/*
 		 * Only swap our new entry into the page cache if the current
diff --git a/include/linux/dax.h b/include/linux/dax.h
index d7af5d243f24..1e2e81c701b6 100644
--- a/include/linux/dax.h
+++ b/include/linux/dax.h
@@ -39,6 +39,12 @@ struct dax_operations {
 	int (*zero_page_range)(struct dax_device *, pgoff_t, size_t);
 };
 
+struct shared_file {
+	struct address_space *mapping;
+	pgoff_t index;
+	struct rb_node node;
+};
+
 extern struct attribute_group dax_attribute_group;
 
 #if IS_ENABLED(CONFIG_DAX)
-- 
2.26.2
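
A note for reviewers: below is a minimal, illustrative sketch (not part of
the patch) of how a consumer, e.g. memory-failure handling, could walk the
shared_file rb-tree to visit every (->mapping, ->index) that maps a given
DAX page.  dax_rmap_for_each_owner() and its callback are hypothetical
names, and the sketch assumes the caller serializes against
dax_associate_entry()/dax_disassociate_entry() so that page->private stays
stable while it runs.

/*
 * Illustrative only: call @fn for every (->mapping, ->index) owning @page.
 * When the page is not shared, the single owner is kept directly in
 * page->mapping and page->index; once it is shared, every owner (including
 * the one mirrored in the page) has a shared_file node in the rb-tree
 * hanging off page->private.
 */
static void dax_rmap_for_each_owner(struct page *page,
		void (*fn)(struct address_space *mapping, pgoff_t index,
			   void *arg),
		void *arg)
{
	struct rb_root_cached *root = (struct rb_root_cached *)page_private(page);
	struct rb_node *node;

	if (!root) {
		/* Not shared: at most one owner, stored in the page itself. */
		if (page->mapping)
			fn(page->mapping, page->index, arg);
		return;
	}

	/* Shared: walk the rb-tree in (mapping, index) order. */
	for (node = rb_first_cached(root); node; node = rb_next(node)) {
		struct shared_file *sf =
			container_of(node, struct shared_file, node);

		fn(sf->mapping, sf->index, arg);
	}
}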