[repost, with more people in the cc: list] We run a cluster of POP and IMAP servers that are used by hundreds of thousands of different users- we see as much as 80.000 different users per server on any given day. The servers have their mailspool NFS mounted, and each user has a unique user-id. We noticed a high CPU load on these servers, sometimes near 100% system time being used on all cores. I used oprofile to find out where the CPU was being burned and it turned out to be in rpcauth_lookup_credcache(). The credential cache hashtable is sized way to small for our purposes- it's just 16 slots, and with 100.000 entries in the generic and unix credential caches the chainlength goes over 6000. It's not that we have tens of thousands of different uids active on the box at the same time, it's just that the box has lots of RAM and all the caches - including the credential caches - just grow and grow. It appears that both the auth_unix and auth_generic credential caches grow at the same rate and are the same size, but most of the entries in the auth_generic credcache are inactive and could be pruned by a rpcauth_prune_expired() run, where the auth_unix credcache contains mostly active (refcounted) entries. The auth_unix credcache can only be pruned by echo 3>/proc/sys/vm/drop_caches. The patch below adds a hashtable size module parameter which if set to e.g. 4096 fixes the problem for us. Note that this is a patch for 2.6.27.x, it applies almost cleanly to 2.6.29.3, but it's a proof-of-concept patch anyway. I'm not sure if this is the right or complete fix, we should look into things like: - is there a way to limit the size of the active number of entries in the auth_unix credential cache ? - should the hashtables be size initially based on the amount of RAM ? - should the hashtables be resized dynamically during use ? - should rpcauth_cache_shrinker be run more often, not only when there is memory pressure but also from say a timer interrupt ? What should be used for nr_to_scan in that case ? - why are both auth_generic and auth_unix used (probably has a good reason but I don't know much of the rpc/auth stuff really) The simple fix would be to size the caches based on some RAM-based heuristic, VFS pressure will then limit the hash chainlength to something reasonable. Correct ? What would that heuristic look like ? # ------ # Detailed patch description: 1. bugfixes: - rpcauth_prune_expired: check nr_to_scan == 0 before decrementing it (in 2.6.29.3 nr_to_scan is decremented but otherwise ignored ?..) - rpcauth_prune_expired: keep cred on unused list when cred->cr_count == 0 and it's not expired yet (also fixed in 2.6.29.3) - set RPC_AUTH_EXPIRY_MORATORIUM to 5*HZ to make it match with the comment ('5 second garbage collection moratorium') (in 2.6.29.3 the comment has been fixed instead) 2. hashtable sizing: rpcauth_init_credcache: allocate hashtables dyamically auth_unix.c: use rpcauth_init_credcache auth_generic.c: use rpcauth_init_credcache sunrpc_syms.c: add hashsize module parameter init_sunrpc: call rpcauth_init_module with hashsize parameter 3. debugging: - rpcauth_prune_expired: don't always delete the entire cred_unused list, active entries are retained. rpcauth_lookup_credcache: add new creds to cred_unused right away, making the cred_unused list really a cred_lru list of all creds - adds sysctl debugging to analyze the number of active and inactive entries on the cred_unused list - add a sysctl knob to read the longest chainlength reached on any hashtable diff -ru orig/linux-2.6.27.21/include/linux/sunrpc/auth.h linux-2.6.27.21/include/linux/sunrpc/auth.h --- orig/linux-2.6.27.21/include/linux/sunrpc/auth.h 2009-03-23 23:04:09.000000000 +0100 +++ linux-2.6.27.21/include/linux/sunrpc/auth.h 2009-05-19 16:02:35.000000000 +0200 @@ -62,8 +62,12 @@ */ #define RPC_CREDCACHE_HASHBITS 4 #define RPC_CREDCACHE_NR (1 << RPC_CREDCACHE_HASHBITS) +#define RPC_CREDCACHE_MIN 4 +#define RPC_CREDCACHE_MAX 16384 struct rpc_cred_cache { - struct hlist_head hashtable[RPC_CREDCACHE_NR]; + int hashsize; + int hashbits; + struct hlist_head *hashtable; spinlock_t lock; }; @@ -124,9 +128,8 @@ extern const struct rpc_authops authunix_ops; extern const struct rpc_authops authnull_ops; -void __init rpc_init_authunix(void); -void __init rpc_init_generic_auth(void); -void __init rpcauth_init_module(void); +int __init rpc_init_generic_auth(void); +int __init rpcauth_init_module(int); void __exit rpcauth_remove_module(void); void __exit rpc_destroy_generic_auth(void); diff -ru orig/linux-2.6.27.21/net/sunrpc/auth.c linux-2.6.27.21/net/sunrpc/auth.c --- orig/linux-2.6.27.21/net/sunrpc/auth.c 2009-03-23 23:04:09.000000000 +0100 +++ linux-2.6.27.21/net/sunrpc/auth.c 2009-05-19 16:02:05.000000000 +0200 @@ -14,6 +14,8 @@ #include <linux/hash.h> #include <linux/sunrpc/clnt.h> #include <linux/spinlock.h> +#include <linux/vmalloc.h> +#include <linux/sysctl.h> #ifdef RPC_DEBUG # define RPCDBG_FACILITY RPCDBG_AUTH @@ -28,6 +30,11 @@ static LIST_HEAD(cred_unused); static unsigned long number_cred_unused; +int credcache_hashsize = RPC_CREDCACHE_NR; + +#ifdef RPC_DEBUG +static int credcache_max_chainlen; +#endif static u32 pseudoflavor_to_flavor(u32 flavor) { @@ -149,7 +156,14 @@ new = kmalloc(sizeof(*new), GFP_KERNEL); if (!new) return -ENOMEM; - for (i = 0; i < RPC_CREDCACHE_NR; i++) + new->hashsize = credcache_hashsize; + new->hashbits = ilog2(new->hashsize); + new->hashtable = vmalloc(new->hashsize * sizeof(struct hlist_head)); + if (!new->hashtable) { + kfree(new); + return -ENOMEM; + } + for (i = 0; i < new->hashsize; i++) INIT_HLIST_HEAD(&new->hashtable[i]); spin_lock_init(&new->lock); auth->au_credcache = new; @@ -186,7 +200,7 @@ spin_lock(&rpc_credcache_lock); spin_lock(&cache->lock); - for (i = 0; i < RPC_CREDCACHE_NR; i++) { + for (i = 0; i < cache->hashsize; i++) { head = &cache->hashtable[i]; while (!hlist_empty(head)) { cred = hlist_entry(head->first, struct rpc_cred, cr_hash); @@ -215,13 +229,15 @@ if (cache) { auth->au_credcache = NULL; rpcauth_clear_credcache(cache); + if (cache->hashtable) + vfree(cache->hashtable); kfree(cache); } } EXPORT_SYMBOL_GPL(rpcauth_destroy_credcache); -#define RPC_AUTH_EXPIRY_MORATORIUM (60 * HZ) +#define RPC_AUTH_EXPIRY_MORATORIUM (5 * HZ) /* * Remove stale credentials. Avoid sleeping inside the loop. @@ -230,13 +246,12 @@ rpcauth_prune_expired(struct list_head *free, int nr_to_scan) { spinlock_t *cache_lock; - struct rpc_cred *cred; + struct rpc_cred *cred, *n; unsigned long expired = jiffies - RPC_AUTH_EXPIRY_MORATORIUM; - while (!list_empty(&cred_unused)) { - cred = list_entry(cred_unused.next, struct rpc_cred, cr_lru); - list_del_init(&cred->cr_lru); - number_cred_unused--; + list_for_each_entry_safe(cred, n, &cred_unused, cr_lru) { + if (nr_to_scan == 0) + break; if (atomic_read(&cred->cr_count) != 0) continue; /* Enforce a 5 second garbage collection moratorium */ @@ -247,13 +262,12 @@ spin_lock(cache_lock); if (atomic_read(&cred->cr_count) == 0) { get_rpccred(cred); - list_add_tail(&cred->cr_lru, free); + list_move_tail(&cred->cr_lru, free); + number_cred_unused--; rpcauth_unhash_cred_locked(cred); nr_to_scan--; } spin_unlock(cache_lock); - if (nr_to_scan == 0) - break; } return nr_to_scan; } @@ -267,8 +281,6 @@ LIST_HEAD(free); int res; - if (list_empty(&cred_unused)) - return 0; spin_lock(&rpc_credcache_lock); nr_to_scan = rpcauth_prune_expired(&free, nr_to_scan); res = (number_cred_unused / 100) * sysctl_vfs_cache_pressure; @@ -290,11 +302,19 @@ struct rpc_cred *cred = NULL, *entry, *new; unsigned int nr; +#ifdef RPC_DEBUG + int chainlen = 0; +#endif - nr = hash_long(acred->uid, RPC_CREDCACHE_HASHBITS); + nr = hash_long(acred->uid, cache->hashbits); rcu_read_lock(); hlist_for_each_entry_rcu(entry, pos, &cache->hashtable[nr], cr_hash) { +#ifdef RPC_DEBUG + chainlen++; + if (chainlen > credcache_max_chainlen) + credcache_max_chainlen = chainlen; +#endif if (!entry->cr_ops->crmatch(acred, entry, flags)) continue; spin_lock(&cache->lock); @@ -331,6 +351,15 @@ } else list_add_tail(&new->cr_lru, &free); spin_unlock(&cache->lock); +#ifdef RPC_DEBUG + /* Add to LRU list early for statistics */ + if (cred == new) { + spin_lock(&rpc_credcache_lock); + list_add_tail(&cred->cr_lru, &cred_unused); + number_cred_unused++; + spin_unlock(&rpc_credcache_lock); + } +#endif found: if (test_bit(RPCAUTH_CRED_NEW, &cred->cr_flags) && cred->cr_ops->cr_init != NULL @@ -566,19 +595,147 @@ test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) != 0; } +#ifdef RPC_DEBUG +static int proc_credcache_hashsize(struct ctl_table *table, int write, + struct file *file, void __user *buffer, + size_t *length, loff_t *ppos) +{ + int tmp = credcache_hashsize; + + table->data = &tmp; + table->maxlen = sizeof(int); + proc_dointvec(table, write, file, buffer, length, ppos); + if (write) { + if (tmp < RPC_CREDCACHE_MIN || + tmp > RPC_CREDCACHE_MAX || + !is_power_of_2(tmp)) + return -EINVAL; + credcache_hashsize = tmp; + } + return 0; +} + +static int proc_auth_credcache_size(struct ctl_table *table, int write, + struct file *file, void __user *buffer, + size_t *length, loff_t *ppos) +{ + struct rpc_cred *cred; + unsigned long n, used = 0, unused = 0; + int tmp; + + spin_lock(&rpc_credcache_lock); + list_for_each_entry(cred, &cred_unused, cr_lru) { + if (atomic_read(&cred->cr_count)) + used++; + else + unused++; + } + spin_unlock(&rpc_credcache_lock); + + n = table->extra1 ? used : unused; + tmp = n > INT_MAX ? INT_MAX : n; + table->data = &tmp; + table->maxlen = sizeof(int); + + proc_dointvec(table, write, file, buffer, length, ppos); + if (write) { + int nr = unused - tmp; + if (nr > 0) + rpcauth_cache_shrinker(nr, GFP_KERNEL); + } + + return 0; +} + +static ctl_table sunrpc_credcache_knobs_table [] = { + { + .procname = "credcache_hashsize", + .data = NULL, + .mode = 0644, + .proc_handler = &proc_credcache_hashsize, + }, + { + .procname = "credcache_used", + .data = NULL, + .mode = 0444, + .proc_handler = &proc_auth_credcache_size, + .extra1 = (void *)1, + }, + { + .procname = "credcache_unused", + .data = NULL, + .mode = 0644, + .proc_handler = &proc_auth_credcache_size, + .extra1 = (void *)0, + }, + { + .procname = "credcache_total", + .data = &number_cred_unused, + .maxlen = sizeof(unsigned long), + .mode = 0444, + .proc_handler = &proc_doulongvec_minmax, + }, + { + .procname = "credcache_max_chainlen", + .data = &credcache_max_chainlen, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, + { + .ctl_name = 0, + } +}; + +static ctl_table sunrpc_credcache_table[] = { + { + .ctl_name = CTL_SUNRPC, + .procname = "sunrpc", + .mode = 0555, + .child = sunrpc_credcache_knobs_table, + }, + { + .ctl_name = 0, + } +}; + +static struct ctl_table_header *sunrpc_credcache_table_header; +#endif + static struct shrinker rpc_cred_shrinker = { .shrink = rpcauth_cache_shrinker, .seeks = DEFAULT_SEEKS, }; -void __init rpcauth_init_module(void) +int __init rpcauth_init_module(int hashsize) { - rpc_init_authunix(); - rpc_init_generic_auth(); + int err; + + if (hashsize) { + hashsize = min(hashsize, RPC_CREDCACHE_MAX); + hashsize = max(hashsize, RPC_CREDCACHE_MIN); + credcache_hashsize = rounddown_pow_of_two(hashsize); + printk(KERN_INFO "RPC: credcache hashtable size %d\n", + credcache_hashsize); + } + + err = rpc_init_generic_auth(); + if (err) + goto out; +#ifdef RPC_DEBUG + sunrpc_credcache_table_header = + register_sysctl_table(sunrpc_credcache_table); +#endif register_shrinker(&rpc_cred_shrinker); +out: + return err; } void __exit rpcauth_remove_module(void) { +#ifdef RPC_DEBUG + if (sunrpc_credcache_table_header) + unregister_sysctl_table(sunrpc_credcache_table_header); +#endif unregister_shrinker(&rpc_cred_shrinker); } diff -ru orig/linux-2.6.27.21/net/sunrpc/auth_generic.c linux-2.6.27.21/net/sunrpc/auth_generic.c --- orig/linux-2.6.27.21/net/sunrpc/auth_generic.c 2009-03-23 23:04:09.000000000 +0100 +++ linux-2.6.27.21/net/sunrpc/auth_generic.c 2009-05-19 12:03:40.000000000 +0200 @@ -26,7 +26,6 @@ }; static struct rpc_auth generic_auth; -static struct rpc_cred_cache generic_cred_cache; static const struct rpc_credops generic_credops; /* @@ -158,20 +157,16 @@ return 0; } -void __init rpc_init_generic_auth(void) +int __init rpc_init_generic_auth(void) { - spin_lock_init(&generic_cred_cache.lock); + return rpcauth_init_credcache(&generic_auth); } void __exit rpc_destroy_generic_auth(void) { - rpcauth_clear_credcache(&generic_cred_cache); + rpcauth_destroy_credcache(&generic_auth); } -static struct rpc_cred_cache generic_cred_cache = { - {{ NULL, },}, -}; - static const struct rpc_authops generic_auth_ops = { .owner = THIS_MODULE, .au_name = "Generic", @@ -182,7 +177,6 @@ static struct rpc_auth generic_auth = { .au_ops = &generic_auth_ops, .au_count = ATOMIC_INIT(0), - .au_credcache = &generic_cred_cache, }; static const struct rpc_credops generic_credops = { diff -ru orig/linux-2.6.27.21/net/sunrpc/auth_unix.c linux-2.6.27.21/net/sunrpc/auth_unix.c --- orig/linux-2.6.27.21/net/sunrpc/auth_unix.c 2009-03-23 23:04:09.000000000 +0100 +++ linux-2.6.27.21/net/sunrpc/auth_unix.c 2009-05-19 14:06:45.000000000 +0200 @@ -28,15 +28,23 @@ #endif static struct rpc_auth unix_auth; -static struct rpc_cred_cache unix_cred_cache; static const struct rpc_credops unix_credops; static struct rpc_auth * unx_create(struct rpc_clnt *clnt, rpc_authflavor_t flavor) { + int err; + dprintk("RPC: creating UNIX authenticator for client %p\n", clnt); atomic_inc(&unix_auth.au_count); + if (!unix_auth.au_credcache) { + err = rpcauth_init_credcache(&unix_auth); + if (err) { + atomic_dec(&unix_auth.au_count); + return ERR_PTR(err); + } + } return &unix_auth; } @@ -202,11 +210,6 @@ return p; } -void __init rpc_init_authunix(void) -{ - spin_lock_init(&unix_cred_cache.lock); -} - const struct rpc_authops authunix_ops = { .owner = THIS_MODULE, .au_flavor = RPC_AUTH_UNIX, @@ -218,17 +221,12 @@ }; static -struct rpc_cred_cache unix_cred_cache = { -}; - -static struct rpc_auth unix_auth = { .au_cslack = UNX_WRITESLACK, .au_rslack = 2, /* assume AUTH_NULL verf */ .au_ops = &authunix_ops, .au_flavor = RPC_AUTH_UNIX, .au_count = ATOMIC_INIT(0), - .au_credcache = &unix_cred_cache, }; static diff -ru orig/linux-2.6.27.21/net/sunrpc/sunrpc_syms.c linux-2.6.27.21/net/sunrpc/sunrpc_syms.c --- orig/linux-2.6.27.21/net/sunrpc/sunrpc_syms.c 2009-03-23 23:04:09.000000000 +0100 +++ linux-2.6.27.21/net/sunrpc/sunrpc_syms.c 2009-05-19 12:53:10.000000000 +0200 @@ -23,6 +23,7 @@ #include <linux/sunrpc/xprtsock.h> extern struct cache_detail ip_map_cache, unix_gid_cache; +static int hashsize; static int __init init_sunrpc(void) @@ -31,13 +32,14 @@ if (err) goto out; err = rpc_init_mempool(); - if (err) { - unregister_rpc_pipefs(); - goto out; - } + if (err) + goto out_err1; #ifdef RPC_DEBUG rpc_register_sysctl(); #endif + err = rpcauth_init_module(hashsize); + if (err) + goto out_err2; #ifdef CONFIG_PROC_FS rpc_proc_init(); #endif @@ -45,7 +47,14 @@ cache_register(&unix_gid_cache); svc_init_xprt_sock(); /* svc sock transport */ init_socket_xprt(); /* clnt sock transport */ - rpcauth_init_module(); + goto out; +out_err2: + rpc_destroy_mempool(); +#ifdef RPC_DEBUG + rpc_unregister_sysctl(); +#endif +out_err1: + unregister_rpc_pipefs(); out: return err; } @@ -67,6 +76,8 @@ rpc_proc_exit(); #endif } +module_param(hashsize, int, 0); +MODULE_PARM_DESC(hashsize, "size of hashtables for credential caches"); MODULE_LICENSE("GPL"); module_init(init_sunrpc); module_exit(cleanup_sunrpc); -- To unsubscribe from this list: send the line "unsubscribe linux-nfs" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html