On Thu, Jun 24, 2010 at 01:03:02PM +1000, npiggin@xxxxxxx wrote:
> Allow the shrinker to do per-zone shrinking. This means it is called for
> each zone scanned. The shrinker is now completely responsible for calculating
> and batching (given helpers), which provides better flexibility.
> 
> Finding the ratio of objects to scan requires scaling the ratio of pagecache
> objects scanned. By passing down both the per-zone and the global reclaimable
> pages, per-zone caches and global caches can be calculated correctly.
> 
> Finally, add some fixed-point scaling to the ratio, which helps calculations.
> 
> Signed-off-by: Nick Piggin <npiggin@xxxxxxx>
> ---
>  fs/dcache.c        |    2
>  fs/drop_caches.c   |    2
>  fs/inode.c         |    2
>  fs/mbcache.c       |    4 -
>  fs/nfs/dir.c       |    2
>  fs/nfs/internal.h  |    2
>  fs/quota/dquot.c   |    2
>  include/linux/mm.h |    6 +-
>  mm/vmscan.c        |  131 ++++++++++++++---------------------------------
>  9 files changed, 47 insertions(+), 106 deletions(-)

The diffstat doesn't match the patch ;)

> Index: linux-2.6/include/linux/mm.h
> ===================================================================
> --- linux-2.6.orig/include/linux/mm.h
> +++ linux-2.6/include/linux/mm.h
> @@ -999,16 +999,19 @@ static inline void sync_mm_rss(struct ta
>   * querying the cache size, so a fastpath for that case is appropriate.
>   */
>  struct shrinker {
> -	int (*shrink)(int nr_to_scan, gfp_t gfp_mask);
> -	int seeks;	/* seeks to recreate an obj */
> -
> +	int (*shrink)(struct zone *zone, unsigned long scanned, unsigned long total,
> +		unsigned long global, gfp_t gfp_mask);

Can we add the shrinker structure to that callback, too, so that we
can get away from needing global context for the shrinker?

> +unsigned long shrinker_do_scan(unsigned long *dst, unsigned long batch)
> +{
> +	unsigned long nr = ACCESS_ONCE(*dst);

What's the point of ACCESS_ONCE() here?

/me gets most of the way into the patch

Oh, it's because you are using static variables for nr_to_scan and
hence when concurrent shrinkers are running they are all incrementing
and decrementing the same variable. That doesn't sound like a good idea
to me - concurrent shrinkers are much more likely with per-zone
shrinker callouts. It seems to me that a reclaim thread could be kept
in a shrinker long after it has run its scan count if new shrinker
calls from a different reclaim context occur before the first has
finished....

As a further question - why do some shrinkers get converted to a
single global nr_to_scan, and others get converted to a private
nr_to_scan? Shouldn't they all use the same method?

The static variable method looks to me to be full of races - concurrent
callers to shrinker_add_scan() do not look at all thread safe to me.

> +	if (nr < batch)
> +		return 0;

Why wouldn't we return nr here to drain the remaining objects? Doesn't
this mean we can't shrink caches that have a scan count of less than
SHRINK_BATCH?

> +	*dst = nr - batch;

Similarly, that is not a threadsafe update.
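If the deferred count really does have to live outside the shrinker
for now, something like the following would at least close the
lost-update window between concurrent callers. Rough, untested sketch
only - it assumes the counter becomes an atomic_long_t rather than the
plain unsigned long the patch uses:

	unsigned long shrinker_do_scan(atomic_long_t *dst, unsigned long batch)
	{
		long nr, remaining;

		do {
			nr = atomic_long_read(dst);
			if (nr < (long)batch)
				return 0;	/* keeps the sub-batch remainder, as above */
			remaining = nr - batch;
		} while (atomic_long_cmpxchg(dst, nr, remaining) != nr);

		return batch;
	}

That doesn't solve the bigger problem of multiple reclaim contexts
sharing one deferred count, but at least two racing callers can't both
walk away with the same batch.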
> +	return batch;
> +}
> +EXPORT_SYMBOL(shrinker_do_scan);
> +
>  /*
>   * Call the shrink functions to age shrinkable caches
>   *
> @@ -198,8 +228,8 @@ EXPORT_SYMBOL(unregister_shrinker);
>   *
>   * Returns the number of slab objects which we shrunk.
>   */
> -unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
> -			unsigned long lru_pages)
> +static unsigned long shrink_slab(struct zone *zone, unsigned long scanned, unsigned long total,
> +			unsigned long global, gfp_t gfp_mask)
>  {
>  	struct shrinker *shrinker;
>  	unsigned long ret = 0;
> @@ -211,55 +241,25 @@ unsigned long shrink_slab(unsigned long
>  		return 1;	/* Assume we'll be able to shrink next time */
>
>  	list_for_each_entry(shrinker, &shrinker_list, list) {
> -		unsigned long long delta;
> -		unsigned long total_scan;
> -		unsigned long max_pass = (*shrinker->shrink)(0, gfp_mask);
> -
> -		delta = (4 * scanned) / shrinker->seeks;
> -		delta *= max_pass;
> -		do_div(delta, lru_pages + 1);
> -		shrinker->nr += delta;
> -		if (shrinker->nr < 0) {
> -			printk(KERN_ERR "shrink_slab: %pF negative objects to "
> -			       "delete nr=%ld\n",
> -			       shrinker->shrink, shrinker->nr);
> -			shrinker->nr = max_pass;
> -		}
> -
> -		/*
> -		 * Avoid risking looping forever due to too large nr value:
> -		 * never try to free more than twice the estimate number of
> -		 * freeable entries.
> -		 */
> -		if (shrinker->nr > max_pass * 2)
> -			shrinker->nr = max_pass * 2;
> -
> -		total_scan = shrinker->nr;
> -		shrinker->nr = 0;
> -
> -		while (total_scan >= SHRINK_BATCH) {
> -			long this_scan = SHRINK_BATCH;
> -			int shrink_ret;
> -			int nr_before;
> -
> -			nr_before = (*shrinker->shrink)(0, gfp_mask);
> -			shrink_ret = (*shrinker->shrink)(this_scan, gfp_mask);
> -			if (shrink_ret == -1)
> -				break;
> -			if (shrink_ret < nr_before)
> -				ret += nr_before - shrink_ret;
> -			count_vm_events(SLABS_SCANNED, this_scan);
> -			total_scan -= this_scan;
> -
> -			cond_resched();

Removing this means we need cond_resched() in all shrinker loops now
to maintain the same latencies as we currently have. I note that
you've done this for most of the shrinkers, but the documentation
needs to be updated to mention this...

> -		}
> -
> -		shrinker->nr += total_scan;

And dropping this means we do not carry over the remainder of the
previous scan into the next scan. This means we could be scanning a
lot less with this new code.

> +		(*shrinker->shrink)(zone, scanned, total, global, gfp_mask);
>  	}
>  	up_read(&shrinker_rwsem);
>  	return ret;
>  }
>
> +void shrink_all_slab(void)
> +{
> +	struct zone *zone;
> +	unsigned long nr;
> +
> +again:
> +	nr = 0;
> +	for_each_zone(zone)
> +		nr += shrink_slab(zone, 1, 1, 1, GFP_KERNEL);
> +	if (nr >= 10)
> +		goto again;

	do {
		nr = 0;
		for_each_zone(zone)
			nr += shrink_slab(zone, 1, 1, 1, GFP_KERNEL);
	} while (nr >= 10);

> @@ -1705,6 +1708,23 @@ static void shrink_zone(int priority, st
>  	if (inactive_anon_is_low(zone, sc) && nr_swap_pages > 0)
>  		shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0);
>
> +	/*
> +	 * Don't shrink slabs when reclaiming memory from
> +	 * over limit cgroups
> +	 */
> +	if (scanning_global_lru(sc)) {
> +		struct reclaim_state *reclaim_state = current->reclaim_state;
> +
> +		shrink_slab(zone, sc->nr_scanned - nr_scanned,
> +				lru_pages, global_lru_pages, sc->gfp_mask);
> +		if (reclaim_state) {
> +			nr_reclaimed += reclaim_state->reclaimed_slab;
> +			reclaim_state->reclaimed_slab = 0;
> +		}
> +	}

So effectively we are going to be calling shrink_slab() once per zone
instead of once per priority loop, right? That means we are going to be
doing a lot more concurrent shrink_slab() calls than the current code.
Combine that with the removal of residual aggregation, and I think this
will alter the reclaim balance somewhat. Have you tried to quantify
this?
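Coming back to the cond_resched() point above: as far as I can tell
every converted shrinker now has to open-code roughly this loop, so it
would be worth documenting it. This is only a sketch - prune_my_cache()
is a made-up stand-in for the per-cache scan, and nr_objects is
whatever the cache counts as reclaimable:

	shrinker_add_scan(&nr_to_scan, scanned, global, nr_objects,
				DEFAULT_SEEKS);
	while ((nr = shrinker_do_scan(&nr_to_scan, SHRINK_BATCH))) {
		prune_my_cache(nr, gfp_mask);
		count_vm_events(SLABS_SCANNED, nr);	/* if the old accounting is still wanted */
		cond_resched();
	}
	/* anything below SHRINK_BATCH stays in nr_to_scan for next time */

If that pattern were wrapped up in a helper or kept in the core loop,
the cond_resched() and the SLABS_SCANNED accounting couldn't silently
go missing in individual shrinkers.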
> Index: linux-2.6/fs/dcache.c
> ===================================================================
> --- linux-2.6.orig/fs/dcache.c
> +++ linux-2.6/fs/dcache.c
> @@ -748,20 +748,26 @@ again2:
>   *
>   * This function may fail to free any resources if all the dentries are in use.
>   */
> -static void prune_dcache(int count)
> +static void prune_dcache(struct zone *zone, unsigned long scanned,
> +		unsigned long total, gfp_t gfp_mask)
> +
>  {
> +	unsigned long nr_to_scan;
>  	struct super_block *sb, *n;
>  	int w_count;
> -	int unused = dentry_stat.nr_unused;
>  	int prune_ratio;
> -	int pruned;
> +	int count, pruned;
>
> -	if (unused == 0 || count == 0)
> +	shrinker_add_scan(&nr_to_scan, scanned, total, dentry_stat.nr_unused,
> +			DEFAULT_SEEKS * sysctl_vfs_cache_pressure / 100);
> +done:
> +	count = shrinker_do_scan(&nr_to_scan, SHRINK_BATCH);
> +	if (dentry_stat.nr_unused == 0 || count == 0)
>  		return;
> -	if (count >= unused)
> +	if (count >= dentry_stat.nr_unused)
>  		prune_ratio = 1;
>  	else
> -		prune_ratio = unused / count;
> +		prune_ratio = dentry_stat.nr_unused / count;
>  	spin_lock(&sb_lock);
>  	list_for_each_entry_safe(sb, n, &super_blocks, s_list) {
>  		if (list_empty(&sb->s_instances))
> @@ -810,6 +816,10 @@ static void prune_dcache(int count)
>  			break;
>  		}
>  	spin_unlock(&sb_lock);
> +	if (count <= 0) {
> +		cond_resched();
> +		goto done;
> +	}
>  }
>
>  /**
> @@ -1176,19 +1186,15 @@ EXPORT_SYMBOL(shrink_dcache_parent);
>   *
>   * In this case we return -1 to tell the caller that we baled.
>   */
> -static int shrink_dcache_memory(int nr, gfp_t gfp_mask)
> +static int shrink_dcache_memory(struct zone *zone, unsigned long scanned,
> +		unsigned long total, unsigned long global, gfp_t gfp_mask)
>  {
> -	if (nr) {
> -		if (!(gfp_mask & __GFP_FS))
> -			return -1;
> -		prune_dcache(nr);
> -	}
> -	return (dentry_stat.nr_unused / 100) * sysctl_vfs_cache_pressure;
> +	prune_dcache(zone, scanned, global, gfp_mask);
> +	return 0;
>  }

I would have thought that putting the shrinker_add_scan/shrinker_do_scan
loop in shrink_dcache_memory() and leaving prune_dcache untouched would
have been a better separation. I note that this is what you did with
prune_icache(), so consistency between the two would be good ;)

Also, this patch drops the __GFP_FS check from the dcache shrinker -
not intentional, right?

> @@ -211,28 +215,38 @@ mb_cache_shrink_fn(int nr_to_scan, gfp_t
>  			   atomic_read(&cache->c_entry_count));
>  		count += atomic_read(&cache->c_entry_count);
>  	}
> +	shrinker_add_scan(&nr_to_scan, scanned, global, count,
> +			DEFAULT_SEEKS * sysctl_vfs_cache_pressure / 100);
>  	mb_debug("trying to free %d entries", nr_to_scan);
> -	if (nr_to_scan == 0) {
> +
> +again:
> +	nr = shrinker_do_scan(&nr_to_scan, SHRINK_BATCH);
> +	if (!nr) {
>  		spin_unlock(&mb_cache_spinlock);
> -		goto out;
> +		return 0;
>  	}
> -	while (nr_to_scan-- && !list_empty(&mb_cache_lru_list)) {
> +	while (!list_empty(&mb_cache_lru_list)) {
>  		struct mb_cache_entry *ce =
>  			list_entry(mb_cache_lru_list.next,
>  				   struct mb_cache_entry, e_lru_list);
>  		list_move_tail(&ce->e_lru_list, &free_list);
>  		__mb_cache_entry_unhash(ce);
> +		cond_resched_lock(&mb_cache_spinlock);
> +		if (!--nr)
> +			break;
>  	}
>  	spin_unlock(&mb_cache_spinlock);
>  	list_for_each_safe(l, ltmp, &free_list) {
>  		__mb_cache_entry_forget(list_entry(l, struct mb_cache_entry,
>  				e_lru_list), gfp_mask);
>  	}
> -out:
> -	return (count / 100) * sysctl_vfs_cache_pressure;
> +	if (!nr) {
> +		spin_lock(&mb_cache_spinlock);
> +		goto again;
> +	}

Another candidate for a do-while loop.
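Something along these lines (untested skeleton, entry-moving details
elided) would avoid the backwards goto while keeping the locking
pattern of the quoted hunk:

	spin_lock(&mb_cache_spinlock);
	/* ... count entries and call shrinker_add_scan() as above ... */
	do {
		nr = shrinker_do_scan(&nr_to_scan, SHRINK_BATCH);
		if (!nr)
			break;
		/* ... move up to nr entries from the LRU onto free_list,
		 *     decrementing nr as we go ... */
		spin_unlock(&mb_cache_spinlock);
		/* ... __mb_cache_entry_forget() everything on free_list ... */
		spin_lock(&mb_cache_spinlock);
	} while (!nr);
	spin_unlock(&mb_cache_spinlock);
	return 0;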
> +	return 0;
>  }
>
> -
>  /*
>   * mb_cache_create() create a new cache
>   *
> Index: linux-2.6/fs/nfs/dir.c
> ===================================================================
> --- linux-2.6.orig/fs/nfs/dir.c
> +++ linux-2.6/fs/nfs/dir.c
> @@ -1709,21 +1709,31 @@ static void nfs_access_free_list(struct
>  	}
>  }
>
> -int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask)
> +int nfs_access_cache_shrinker(struct zone *zone, unsigned long scanned,
> +		unsigned long total, unsigned long global, gfp_t gfp_mask)
>  {
> +	static unsigned long nr_to_scan;
>  	LIST_HEAD(head);
> -	struct nfs_inode *nfsi;
>  	struct nfs_access_entry *cache;
> -
> -	if ((gfp_mask & GFP_KERNEL) != GFP_KERNEL)
> -		return (nr_to_scan == 0) ? 0 : -1;
> +	unsigned long nr;
>
>  	spin_lock(&nfs_access_lru_lock);
> -	list_for_each_entry(nfsi, &nfs_access_lru_list, access_cache_inode_lru) {
> +	shrinker_add_scan(&nr_to_scan, scanned, global,
> +			atomic_long_read(&nfs_access_nr_entries),
> +			DEFAULT_SEEKS * sysctl_vfs_cache_pressure / 100);
> +	if (!(gfp_mask & __GFP_FS) || nr_to_scan < SHRINK_BATCH) {
> +		spin_unlock(&nfs_access_lru_lock);
> +		return 0;
> +	}
> +	nr = ACCESS_ONCE(nr_to_scan);
> +	nr_to_scan = 0;

That's not safe for concurrent callers. Both could get nr = nr_to_scan
rather than nr(1) = nr_to_scan and nr(2) = 0, which I think is the
intent....

> Index: linux-2.6/arch/x86/kvm/mmu.c
> ===================================================================
> --- linux-2.6.orig/arch/x86/kvm/mmu.c
> +++ linux-2.6/arch/x86/kvm/mmu.c
> @@ -2924,14 +2924,29 @@ static int kvm_mmu_remove_some_alloc_mmu
>  	return kvm_mmu_zap_page(kvm, page) + 1;
>  }
>
> -static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
> +static int mmu_shrink(struct zone *zone, unsigned long scanned,
> +		unsigned long total, unsigned long global, gfp_t gfp_mask)
>  {
> +	static unsigned long nr_to_scan;
>  	struct kvm *kvm;
>  	struct kvm *kvm_freed = NULL;
> -	int cache_count = 0;
> +	unsigned long cache_count = 0;
>
>  	spin_lock(&kvm_lock);
> +	list_for_each_entry(kvm, &vm_list, vm_list) {
> +		cache_count += kvm->arch.n_alloc_mmu_pages -
> +				kvm->arch.n_free_mmu_pages;
> +	}
>
> +	shrinker_add_scan(&nr_to_scan, scanned, global, cache_count,
> +			DEFAULT_SEEKS*10);
> +
> +done:
> +	cache_count = shrinker_do_scan(&nr_to_scan, SHRINK_BATCH);
> +	if (!cache_count) {
> +		spin_unlock(&kvm_lock);
> +		return 0;
> +	}

I note that this use of a static scan count is thread safe because all
the calculations are done under the kvm_lock. That's three different
ways the shrinkers implement the same functionality now....
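Which is really an argument for the first comment I made: pass the
struct shrinker into the callback and keep the deferred count in it,
and all of these can use the one pattern. Something like this - again
just a sketch of the idea, not what the patch implements, with the
other existing fields omitted:

	struct shrinker {
		int (*shrink)(struct shrinker *shrinker, struct zone *zone,
				unsigned long scanned, unsigned long total,
				unsigned long global, gfp_t gfp_mask);
		atomic_long_t	nr_to_scan;	/* deferred work, safe to share */
		struct list_head list;
	};

Then the NFS, KVM and XFS shrinkers would all just hand
&shrinker->nr_to_scan to the add_scan/do_scan helpers instead of
rolling their own static or lock-protected counters.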
> Index: linux-2.6/fs/xfs/linux-2.6/xfs_sync.c
> ===================================================================
> --- linux-2.6.orig/fs/xfs/linux-2.6/xfs_sync.c
> +++ linux-2.6/fs/xfs/linux-2.6/xfs_sync.c
> @@ -838,43 +838,52 @@ static struct rw_semaphore xfs_mount_lis
>
>  static int
>  xfs_reclaim_inode_shrink(
> -	int		nr_to_scan,
> +	struct zone	*zone,
> +	unsigned long	scanned,
> +	unsigned long	total,
> +	unsigned long	global,
>  	gfp_t		gfp_mask)
>  {
> +	static unsigned long nr_to_scan;
> +	int		nr;
>  	struct xfs_mount *mp;
>  	struct xfs_perag *pag;
>  	xfs_agnumber_t	ag;
> -	int		reclaimable = 0;
> -
> -	if (nr_to_scan) {
> -		if (!(gfp_mask & __GFP_FS))
> -			return -1;
> -
> -		down_read(&xfs_mount_list_lock);
> -		list_for_each_entry(mp, &xfs_mount_list, m_mplist) {
> -			xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0,
> -				XFS_ICI_RECLAIM_TAG, 1, &nr_to_scan);
> -			if (nr_to_scan <= 0)
> -				break;
> -		}
> -		up_read(&xfs_mount_list_lock);
> -	}
> +	unsigned long	nr_reclaimable = 0;
>
>  	down_read(&xfs_mount_list_lock);
>  	list_for_each_entry(mp, &xfs_mount_list, m_mplist) {
>  		for (ag = 0; ag < mp->m_sb.sb_agcount; ag++) {
>  			pag = xfs_perag_get(mp, ag);
> -			reclaimable += pag->pag_ici_reclaimable;
> +			nr_reclaimable += pag->pag_ici_reclaimable;
>  			xfs_perag_put(pag);
>  		}
>  	}
> +	shrinker_add_scan(&nr_to_scan, scanned, global, nr_reclaimable,
> +			DEFAULT_SEEKS);

That's not thread safe - it's under a read lock. This code really
needs a shrinker context....

> +	if (!(gfp_mask & __GFP_FS)) {
> +		up_read(&xfs_mount_list_lock);
> +		return 0;
> +	}
> +
> +done:
> +	nr = shrinker_do_scan(&nr_to_scan, SHRINK_BATCH);
> +	if (!nr) {
> +		up_read(&xfs_mount_list_lock);
> +		return 0;
> +	}
> +	list_for_each_entry(mp, &xfs_mount_list, m_mplist) {
> +		xfs_inode_ag_iterator(mp, xfs_reclaim_inode, 0,
> +			XFS_ICI_RECLAIM_TAG, 1, &nr);
> +		if (nr <= 0)
> +			goto done;
> +	}

That's missing conditional reschedules....

Cheers,

Dave.
-- 
Dave Chinner
david@xxxxxxxxxxxxx