Having a limit for the number of negative dentries may have an undesirable side effect that no new negative dentries will be allowed when the limit is reached. This may have a performance impact on some workloads. To prevent this from happening, we need a way to prune the negative dentries so that new ones can be created before it is too late. This is done by using the workqueue API to do the pruning gradually when a threshold is reached to minimize performance impact on other running tasks. The current threshold is 1/4 of the initial value of the free pool count. Once the threshold is reached, the automatic pruning process will be kicked in to replenish the free pool. Each pruning run will scan at most 256 LRU dentries and 64 dentries per node to minimize the LRU locks hold time. The pruning rate will be 50 Hz if the free pool count is less than 1/8 of the original and 10 Hz otherwise. A short artificial delay loop is added to wait for changes in the negative dentry count before killing the negative dentry. Sleeping in this case may be problematic as the callers of dput() may not be in a state that is sleepable. Allowing tasks needing negative dentries to potentially go to do the pruning synchronously themselves can cause lock and cacheline contention. The end result may not be better than that of killing recently created negative dentries. Signed-off-by: Waiman Long <longman@xxxxxxxxxx> --- fs/dcache.c | 156 +++++++++++++++++++++++++++++++++++++++++++++-- include/linux/list_lru.h | 1 + mm/list_lru.c | 4 +- 3 files changed, 155 insertions(+), 6 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index fb7e041..3482972 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -134,13 +134,21 @@ struct dentry_stat_t dentry_stat = { * Macros and variables to manage and count negative dentries. */ #define NEG_DENTRY_BATCH (1 << 8) +#define NEG_PRUNING_SIZE (1 << 6) +#define NEG_PRUNING_SLOW_RATE (HZ/10) +#define NEG_PRUNING_FAST_RATE (HZ/50) static long neg_dentry_percpu_limit __read_mostly; static long neg_dentry_nfree_init __read_mostly; /* Free pool initial value */ static struct { raw_spinlock_t nfree_lock; long nfree; /* Negative dentry free pool */ + struct super_block *prune_sb; /* Super_block for pruning */ + int neg_count, prune_count; /* Pruning counts */ } ndblk ____cacheline_aligned_in_smp; +static void prune_negative_dentry(struct work_struct *work); +static DECLARE_DELAYED_WORK(prune_neg_dentry_work, prune_negative_dentry); + static DEFINE_PER_CPU(long, nr_dentry); static DEFINE_PER_CPU(long, nr_dentry_unused); static DEFINE_PER_CPU(long, nr_dentry_neg); @@ -329,6 +337,15 @@ static void __neg_dentry_inc(struct dentry *dentry) */ if (!cnt) dentry->d_flags |= DCACHE_KILL_NEGATIVE; + + /* + * Initiate negative dentry pruning if free pool has less than + * 1/4 of its initial value. + */ + if (READ_ONCE(ndblk.nfree) < neg_dentry_nfree_init/4) { + WRITE_ONCE(ndblk.prune_sb, dentry->d_sb); + schedule_delayed_work(&prune_neg_dentry_work, 1); + } } static inline void neg_dentry_inc(struct dentry *dentry) @@ -770,10 +787,8 @@ static struct dentry *dentry_kill(struct dentry *dentry) * disappear under the hood even if the dentry * lock is temporarily released. */ - unsigned int dflags; + unsigned int dflags = dentry->d_flags; - dentry->d_flags &= ~DCACHE_KILL_NEGATIVE; - dflags = dentry->d_flags; parent = lock_parent(dentry); /* * Abort the killing if the reference count or @@ -964,8 +979,35 @@ void dput(struct dentry *dentry) dentry_lru_add(dentry); - if (unlikely(dentry->d_flags & DCACHE_KILL_NEGATIVE)) - goto kill_it; + if (unlikely(dentry->d_flags & DCACHE_KILL_NEGATIVE)) { + /* + * Kill the dentry if it is really negative and the per-cpu + * negative dentry count has still exceeded the limit even + * after a short artificial delay. + */ + if (d_is_negative(dentry) && + (this_cpu_read(nr_dentry_neg) > neg_dentry_percpu_limit)) { + int loop = 256; + + /* + * Waiting to transfer free negative dentries from the + * free pool to the percpu count. + */ + while (--loop) { + if (READ_ONCE(ndblk.nfree)) { + long cnt = __neg_dentry_nfree_dec(); + + this_cpu_sub(nr_dentry_neg, cnt); + if (cnt) + break; + } + cpu_relax(); + } + if (!loop) + goto kill_it; + } + dentry->d_flags &= ~DCACHE_KILL_NEGATIVE; + } dentry->d_lockref.count--; spin_unlock(&dentry->d_lock); @@ -1326,6 +1368,110 @@ void shrink_dcache_sb(struct super_block *sb) } EXPORT_SYMBOL(shrink_dcache_sb); +/* + * A modified version that attempts to remove a limited number of negative + * dentries as well as some other non-negative dentries at the front. + */ +static enum lru_status dentry_negative_lru_isolate(struct list_head *item, + struct list_lru_one *lru, spinlock_t *lru_lock, void *arg) +{ + struct list_head *freeable = arg; + struct dentry *dentry = container_of(item, struct dentry, d_lru); + enum lru_status status = LRU_SKIP; + + /* + * Stop further list walking for the current node list to limit + * performance impact, but allow further walking in the next node + * list. + */ + if ((ndblk.neg_count >= NEG_PRUNING_SIZE) || + (ndblk.prune_count >= NEG_PRUNING_SIZE)) { + ndblk.prune_count = 0; + return LRU_STOP; + } + + /* + * we are inverting the lru lock/dentry->d_lock here, + * so use a trylock. If we fail to get the lock, just skip + * it + */ + if (!spin_trylock(&dentry->d_lock)) { + ndblk.prune_count++; + return LRU_SKIP; + } + + /* + * Referenced dentries are still in use. If they have active + * counts, just remove them from the LRU. Otherwise give them + * another pass through the LRU. + */ + if (dentry->d_lockref.count) { + d_lru_isolate(lru, dentry); + status = LRU_REMOVED; + goto out; + } + + /* + * Dentries with reference bit on are moved back to the tail + * except for the negative ones. + */ + if ((dentry->d_flags & DCACHE_REFERENCED) && !d_is_negative(dentry)) { + dentry->d_flags &= ~DCACHE_REFERENCED; + status = LRU_ROTATE; + goto out; + } + + status = LRU_REMOVED; + d_lru_shrink_move(lru, dentry, freeable); + if (d_is_negative(dentry)) + ndblk.neg_count++; +out: + spin_unlock(&dentry->d_lock); + ndblk.prune_count++; + return status; +} + +/* + * A workqueue function to prune negative dentry. + * + * The pruning is done gradually over time so as to have as little + * performance impact as possible. + */ +static void prune_negative_dentry(struct work_struct *work) +{ + int freed; + long nfree; + struct super_block *sb = READ_ONCE(ndblk.prune_sb); + LIST_HEAD(dispose); + + if (!sb) + return; + + ndblk.neg_count = ndblk.prune_count = 0; + freed = list_lru_walk(&sb->s_dentry_lru, dentry_negative_lru_isolate, + &dispose, NEG_DENTRY_BATCH); + + if (freed) + shrink_dentry_list(&dispose); + /* + * Continue delayed pruning until negative dentry free pool is at + * least 1/2 of the initial value or the super_block has no more + * negative dentries left at the front. + * + * The pruning rate depends on the size of the free pool. The + * faster rate is used when there is less than 1/8 left. + * Otherwise, the slower rate will be used. + */ + nfree = READ_ONCE(ndblk.nfree); + if (ndblk.neg_count && (nfree < neg_dentry_nfree_init/2)) + schedule_delayed_work(&prune_neg_dentry_work, + (nfree < neg_dentry_nfree_init/8) + ? NEG_PRUNING_FAST_RATE + : NEG_PRUNING_SLOW_RATE); + else + WRITE_ONCE(ndblk.prune_sb, NULL); +} + /** * enum d_walk_ret - action to talke during tree walk * @D_WALK_CONTINUE: contrinue walk diff --git a/include/linux/list_lru.h b/include/linux/list_lru.h index fa7fd03..06c9d15 100644 --- a/include/linux/list_lru.h +++ b/include/linux/list_lru.h @@ -22,6 +22,7 @@ enum lru_status { LRU_SKIP, /* item cannot be locked, skip */ LRU_RETRY, /* item not freeable. May drop the lock internally, but has to return locked. */ + LRU_STOP, /* stop walking the list */ }; struct list_lru_one { diff --git a/mm/list_lru.c b/mm/list_lru.c index 7a40fa2..f6e7796 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -244,11 +244,13 @@ unsigned long list_lru_count_node(struct list_lru *lru, int nid) */ assert_spin_locked(&nlru->lock); goto restart; + case LRU_STOP: + goto out; default: BUG(); } } - +out: spin_unlock(&nlru->lock); return isolated; } -- 1.8.3.1 -- To unsubscribe from this list: send the line "unsubscribe linux-doc" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html