Negative dentry bloat is a well-known problem. For systems without memory pressure, some workloads (like repeated stat calls) can create an unbounded amount of negative dentries quite quickly. In the best case, these dentries could speed up a subsequent name lookup, but in the worst case, they are never used and their memory never freed. While systems without memory pressure may not need that memory for other purposes, negative dentry bloat can have other side-effects, such as soft lockups when traversing the d_subdirs list or general slowness with managing them. It is a good idea to have some sort of mechanism for controlling negative dentries, even outside memory pressure. This patch attempts to do so in a fair way. Workloads which create many negative dentries must create many dentries, or convert dentries from positive to negative. Thus, negative dentry management is best done during these same operations, as it will amortize its cost, and distribute the cost to the perpetrators of the dentry bloat. We introduce a sysctl "negative-dentry-ratio" which sets a maximum number of negative dentries per positive dentry, N:1. When a dentry is created or unlinked, the next N+1 dentries of the parent are scanned. If no positive dentries are found, then a candidate negative dentry is killed. Signed-off-by: Stephen Brennan <stephen.s.brennan@xxxxxxxxxx> --- fs/dcache.c | 93 +++++++++++++++++++++++++++++++++++++++++- include/linux/dcache.h | 1 + 2 files changed, 93 insertions(+), 1 deletion(-) diff --git a/fs/dcache.c b/fs/dcache.c index b1480433ddc5..ba79107a8268 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -74,6 +74,8 @@ int sysctl_vfs_cache_pressure __read_mostly = 100; EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); +unsigned int sysctl_negative_dentry_ratio = 5; + __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); EXPORT_SYMBOL(rename_lock); @@ -183,6 +185,7 @@ static int proc_nr_dentry(struct ctl_table *table, int write, void *buffer, return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } +const unsigned int max_negative_dentry_ratio = 20; static struct ctl_table fs_dcache_sysctls[] = { { .procname = "dentry-state", @@ -191,6 +194,15 @@ static struct ctl_table fs_dcache_sysctls[] = { .mode = 0444, .proc_handler = proc_nr_dentry, }, + { + .procname = "negative-dentry-ratio", + .data = &sysctl_negative_dentry_ratio, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = (void *)&max_negative_dentry_ratio, + }, { } }; @@ -547,6 +559,8 @@ static inline void dentry_unlist(struct dentry *dentry, struct dentry *parent) dentry->d_flags |= DCACHE_DENTRY_KILLED; if (unlikely(list_empty(&dentry->d_child))) return; + if (!IS_ROOT(dentry) && unlikely(dentry->d_parent->d_scan_cursor == &dentry->d_child)) + dentry->d_parent->d_scan_cursor = dentry->d_child.next; __list_del_entry(&dentry->d_child); /* * Cursors can move around the list of children. While we'd been @@ -1751,6 +1765,71 @@ void d_invalidate(struct dentry *dentry) } EXPORT_SYMBOL(d_invalidate); +/** + * Enforce a requirement that a directory never contains more than N negative + * dentries per positive dentry. Scan N + 1 dentries. If no positive dentries + * are found, then kill one negative dentry. parent's lock must be held, and it + * will be released by this function. + */ +static void dentry_incremental_scan(struct dentry *parent) +{ + struct dentry *dentry, *candidate = NULL; + struct list_head *cursor, *start; + unsigned int flags; + int refcount; + int nr_to_scan = sysctl_negative_dentry_ratio + 1; + int nr_positive = 0; + + if (nr_to_scan == 1) + goto out_unlock; + + cursor = start = parent->d_scan_cursor ?: parent->d_subdirs.next; + + rcu_read_lock(); + while (nr_to_scan--) { + if (cursor == &parent->d_subdirs) + goto next; + dentry = container_of(cursor, struct dentry, d_child); + flags = READ_ONCE(dentry->d_flags); + refcount = READ_ONCE(dentry->d_lockref.count); + if (d_flags_negative(flags)) { + if (!refcount && !dentry->d_inode && !candidate) + candidate = dentry; + } else { + nr_positive++; + } + next: + cursor = cursor->next; + if (cursor == start) { + rcu_read_unlock(); + goto out_unlock; + } + } + + parent->d_scan_cursor = cursor; + if (nr_positive || !candidate) { + rcu_read_unlock(); + goto out_unlock; + } + + spin_lock(&candidate->d_lock); + rcu_read_unlock(); + + /* Need to re-read candidate's flags and inode. */ + if (!d_is_negative(candidate) || candidate->d_inode || + candidate->d_lockref.count) { + spin_unlock(&candidate->d_lock); + goto out_unlock; + } + /* No need to cond_resched(); we're not repeating this operation */ + __dentry_kill(candidate, false); + /* parent->d_lock is now unlocked */ + return; + +out_unlock: + spin_unlock(&parent->d_lock); +} + /** * __d_alloc - allocate a dcache entry * @sb: filesystem it will belong to @@ -1814,6 +1893,7 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) dentry->d_sb = sb; dentry->d_op = NULL; dentry->d_fsdata = NULL; + dentry->d_scan_cursor = NULL; INIT_HLIST_BL_NODE(&dentry->d_hash); INIT_LIST_HEAD(&dentry->d_lru); INIT_LIST_HEAD(&dentry->d_subdirs); @@ -1858,7 +1938,7 @@ struct dentry *d_alloc(struct dentry * parent, const struct qstr *name) __dget_dlock(parent); dentry->d_parent = parent; list_add(&dentry->d_child, &parent->d_subdirs); - spin_unlock(&parent->d_lock); + dentry_incremental_scan(parent); /* unlocks parent */ return dentry; } @@ -2521,6 +2601,7 @@ EXPORT_SYMBOL(d_hash_and_lookup); void d_delete(struct dentry * dentry) { struct inode *inode = dentry->d_inode; + struct dentry *parent = NULL; spin_lock(&inode->i_lock); spin_lock(&dentry->d_lock); @@ -2529,7 +2610,17 @@ void d_delete(struct dentry * dentry) */ if (dentry->d_lockref.count == 1) { dentry->d_flags &= ~DCACHE_CANT_MOUNT; + if (!IS_ROOT(dentry)) + parent = dentry->d_parent; dentry_unlink_inode(dentry); + /* + * Since we have created a negative dentry, continue the + * incremental scan to keep enforcing negative dentry ratio. + */ + if (parent) { + spin_lock(&parent->d_lock); + dentry_incremental_scan(parent); /* unlocks parent */ + } } else { __d_drop(dentry); spin_unlock(&dentry->d_lock); diff --git a/include/linux/dcache.h b/include/linux/dcache.h index f5bba51480b2..59c240d8c493 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -102,6 +102,7 @@ struct dentry { }; struct list_head d_child; /* child of parent list */ struct list_head d_subdirs; /* our children */ + struct list_head *d_scan_cursor; /* * d_alias and d_rcu can share memory */ -- 2.30.2