Here is the first version of the hierarchical inotify limits. Changes include: * Added 2 new sysctls: - inotify_reserved_user_instances and inotify_reserved_user_watches. These essentially control the distribution of instances/watches down the user namespace hierarchy. For example, if we have an instances/watches limit of 1024/256 and the reserved instances/watches are set to 128/32, then at every level of the hierarchy the instances/watches limits are reduced by 128/32, so at a userns level of 1 (e.g. init_user_ns->level_1_user_ns) each user would have 896/224 respectively. Currently the defaults are calculated so that at least 8 levels of nesting are allowed. These sysctls can be set only by the global root user. * Changed the core userns code to support adding per-userns/per-user counters; this state is kept in the nsuser_state structure. * Added the necessary functionality to inotify to make use of the newly added userns infrastructure. * Moved the initialization of inotify_max_user_instances/watches to user_namespaces_init so that they are initialised by the time inotify is bootstrapped.
Signed-off-by: Nikolay Borisov <kernel@xxxxxxxx> --- fs/notify/inotify/inotify.h | 2 + fs/notify/inotify/inotify_user.c | 93 +++++++++++++++++++++++++++++++++- include/linux/fsnotify_backend.h | 3 ++ include/linux/user_namespace.h | 45 +++++++++++++++++ kernel/user_namespace.c | 106 ++++++++++++++++++++++++++++++++++++++- 5 files changed, 246 insertions(+), 3 deletions(-) diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h index ed855ef6f077..8ead0a1a3cdb 100644 --- a/fs/notify/inotify/inotify.h +++ b/fs/notify/inotify/inotify.h @@ -1,6 +1,8 @@ #include <linux/fsnotify_backend.h> #include <linux/inotify.h> #include <linux/slab.h> /* struct kmem_cache */ +#include <linux/page_counter.h> +#include <linux/user_namespace.h> struct inotify_event_info { struct fsnotify_event fse; diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c index b8d08d0d0a4d..076a9990eff4 100644 --- a/fs/notify/inotify/inotify_user.c +++ b/fs/notify/inotify/inotify_user.c @@ -48,6 +48,8 @@ static int inotify_max_user_instances __read_mostly; static int inotify_max_queued_events __read_mostly; static int inotify_max_user_watches __read_mostly; +int inotify_reserved_user_instances __read_mostly; +int inotify_reserved_user_watches __read_mostly; static struct kmem_cache *inotify_inode_mark_cachep __read_mostly; @@ -82,10 +84,96 @@ struct ctl_table inotify_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = &zero }, + { + .procname = "reserved_user_instances", + .data = &inotify_reserved_user_instances, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + }, + { + .procname = "reserved_user_watches", + .data = &inotify_reserved_user_watches, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &zero, + }, { } }; #endif /* CONFIG_SYSCTL */ +static inline void __init_counters(struct nsuser_state *state, + struct nsuser_state *parent, + struct user_namespace 
*ns) +{ + if (ns == &init_user_ns) { + page_counter_init(&state->inotify_watches, NULL); + page_counter_init(&state->inotify_instances, NULL); + page_counter_limit(&state->inotify_watches, + init_user_ns.inotify_max_user_watches); + page_counter_limit(&state->inotify_instances, + init_user_ns.inotify_max_user_instances); + } else { + page_counter_init(&state->inotify_watches, + &parent->inotify_watches); + page_counter_init(&state->inotify_instances, + &parent->inotify_instances); + page_counter_limit(&state->inotify_watches, ns->inotify_max_user_watches); + page_counter_limit(&state->inotify_instances, ns->inotify_max_user_instances); + } +} + +static noinline int inotify_init_state(struct user_namespace *ns, kuid_t uid) +{ + struct nsuser_state *state; + struct page_counter *cnt; + + /* We can work with the data without the lock held, since liveliness + * of data is guaranteed as long as the namespace is alive + */ + spin_lock_bh(&nsuser_state_lock); + state = get_nsuser_state(ns, uid); + spin_unlock_bh(&nsuser_state_lock); + + if (!state) { + + state = kzalloc(sizeof(struct nsuser_state), GFP_KERNEL); + if (!state) + return -ENOMEM; + + state->uid = uid; + state->ns = ns; + + if (ns == &init_user_ns) + __init_counters(state, NULL, ns); + else { + struct nsuser_state *parent_state; + + spin_lock_bh(&nsuser_state_lock); + parent_state = get_nsuser_state(ns->parent, ns->owner); + spin_unlock_bh(&nsuser_state_lock); + + BUG_ON(!parent_state); + + __init_counters(state, parent_state, ns); + } + + page_counter_charge(&state->inotify_instances, 1); + + spin_lock_bh(&nsuser_state_lock); + hash_add(nsstate_hash, &state->node, __kuid_val(uid)); + spin_unlock_bh(&nsuser_state_lock); + } else { + if (!page_counter_try_charge(&state->inotify_instances, 1, &cnt)) + return -EMFILE; + } + + return 0; +} + + static inline __u32 inotify_arg_to_mask(u32 arg) { __u32 mask; @@ -819,8 +907,9 @@ static int __init inotify_user_setup(void) inotify_inode_mark_cachep = 
KMEM_CACHE(inotify_inode_mark, SLAB_PANIC); inotify_max_queued_events = 16384; - inotify_max_user_instances = 128; - inotify_max_user_watches = 8192; + /* These reserves should allow for 8 levels of nesting in userns */ + inotify_reserved_user_instances = 32; + inotify_reserved_user_watches = 1024; return 0; } diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 29f917517299..eb83a10afac7 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -170,6 +170,9 @@ struct fsnotify_group { spinlock_t idr_lock; struct idr idr; struct user_struct *user; + struct user_namespace *userns; + kuid_t uid; /* id in the userns this group is + associated with */ } inotify_data; #endif #ifdef CONFIG_FANOTIFY diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h index 8297e5b341d8..3116a2df1cee 100644 --- a/include/linux/user_namespace.h +++ b/include/linux/user_namespace.h @@ -6,6 +6,9 @@ #include <linux/ns_common.h> #include <linux/sched.h> #include <linux/err.h> +#include <linux/hashtable.h> +#include <linux/spinlock.h> +#include <linux/page_counter.h> #define UID_GID_MAP_MAX_EXTENTS 5 @@ -22,6 +25,21 @@ struct uid_gid_map { /* 64 bytes -- 1 cache line */ #define USERNS_INIT_FLAGS USERNS_SETGROUPS_ALLOWED +#define NSSTATE_HASHTABLE_BITS 10 +extern DECLARE_HASHTABLE(nsstate_hash, NSSTATE_HASHTABLE_BITS); +extern spinlock_t nsuser_state_lock; + +/* Generic struct to hold various peruser/perns state */ +struct nsuser_state { + struct hlist_node node; /* keyed at nstate_hash */ + void *ns; /* ns in which uid is valid */ + kuid_t uid; +#ifdef CONFIG_INOTIFY_USER + struct page_counter inotify_watches; /* How many inotify watches does this user */ + struct page_counter inotify_instances; /* How many inotify devs does this user have opened? 
*/ +#endif +}; + struct user_namespace { struct uid_gid_map uid_map; struct uid_gid_map gid_map; @@ -39,11 +57,28 @@ struct user_namespace { struct key *persistent_keyring_register; struct rw_semaphore persistent_keyring_register_sem; #endif + +#ifdef CONFIG_INOTIFY_USER + int inotify_max_user_instances; + int inotify_max_user_watches; +#endif }; extern struct user_namespace init_user_ns; #ifdef CONFIG_USER_NS +static inline struct nsuser_state *get_nsuser_state(struct user_namespace *ns, + kuid_t uid) +{ + struct nsuser_state *state; + + WARN_ON(!spin_is_locked(&nsuser_state_lock)); + + hash_for_each_possible(nsstate_hash, state, node, __kuid_val(uid)) + if (state->ns == ns && uid_eq(state->uid, uid)) + return state; + return NULL; +} static inline struct user_namespace *get_user_ns(struct user_namespace *ns) { @@ -74,6 +109,16 @@ extern int proc_setgroups_show(struct seq_file *m, void *v); extern bool userns_may_setgroups(const struct user_namespace *ns); #else +static inline struct nsuser_state *get_nsuser_state(struct user_namespace *ns, + kuid_t uid) +{ + struct nsuser_state *state; + hash_for_each_possible(nsstate_hash, state, node, &init_user_ns) + if (uid_eq(uid, state->uid) && state->ns == ns); + return state; + return NULL; +} + static inline struct user_namespace *get_user_ns(struct user_namespace *ns) { return &init_user_ns; diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 9bafc211930c..cb51e3607d2d 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -22,10 +22,20 @@ #include <linux/ctype.h> #include <linux/projid.h> #include <linux/fs_struct.h> +#include <linux/spinlock.h> +#include <linux/kernel.h> static struct kmem_cache *user_ns_cachep __read_mostly; static DEFINE_MUTEX(userns_state_mutex); +DEFINE_HASHTABLE(nsstate_hash, NSSTATE_HASHTABLE_BITS); +DEFINE_SPINLOCK(nsuser_state_lock); + +#ifdef CONFIG_INOTIFY_USER +extern int inotify_reserved_user_instances; +extern int inotify_reserved_user_watches; +#endif + 
static bool new_idmap_permitted(const struct file *file, struct user_namespace *ns, int cap_setid, struct uid_gid_map *map); @@ -60,10 +70,13 @@ static void set_cred_user_ns(struct cred *cred, struct user_namespace *user_ns) int create_user_ns(struct cred *new) { struct user_namespace *ns, *parent_ns = new->user_ns; + struct nsuser_state *state, *parent_state; kuid_t owner = new->euid; kgid_t group = new->egid; int ret; - +#ifdef CONFIG_INOTIFY_USER + int tmp; +#endif if (parent_ns->level > 32) return -EUSERS; @@ -88,9 +101,16 @@ int create_user_ns(struct cred *new) if (!ns) return -ENOMEM; + state = kmalloc(sizeof(struct nsuser_state), GFP_KERNEL); + if (!state) { + kmem_cache_free(user_ns_cachep, ns); + return -ENOMEM; + } + ret = ns_alloc_inum(&ns->ns); if (ret) { kmem_cache_free(user_ns_cachep, ns); + kfree(state); return ret; } ns->ns.ops = &userns_operations; @@ -101,6 +121,13 @@ int create_user_ns(struct cred *new) ns->level = parent_ns->level + 1; ns->owner = owner; ns->group = group; +#ifdef CONFIG_INOTIFY_USER + tmp = parent_ns->inotify_max_user_instances - inotify_reserved_user_instances; + ns->inotify_max_user_instances = max(0, tmp); + + tmp = parent_ns->inotify_max_user_watches - inotify_reserved_user_watches; + ns->inotify_max_user_watches = max(0, tmp); +#endif /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ mutex_lock(&userns_state_mutex); @@ -112,8 +139,63 @@ int create_user_ns(struct cred *new) #ifdef CONFIG_PERSISTENT_KEYRINGS init_rwsem(&ns->persistent_keyring_register_sem); #endif + + spin_lock_bh(&nsuser_state_lock); + parent_state = get_nsuser_state(parent_ns, owner); + spin_unlock_bh(&nsuser_state_lock); + if (!parent_state) { + struct nsuser_state *grandfather_state; + + spin_lock_bh(&nsuser_state_lock); + /* init_user_ns doesn't have a parent */ + if (parent_ns == &init_user_ns) + grandfather_state = get_nsuser_state(parent_ns, parent_ns->owner); + else + grandfather_state = get_nsuser_state(parent_ns->parent, parent_ns->owner); + 
spin_unlock_bh(&nsuser_state_lock); + + state->uid = owner; + state->ns = parent_ns; + +#ifdef CONFIG_INOTIFY_USER + page_counter_init(&state->inotify_watches, + &grandfather_state->inotify_watches); + page_counter_init(&state->inotify_instances, + &grandfather_state->inotify_instances); + page_counter_limit(&state->inotify_watches, + parent_ns->inotify_max_user_watches); + page_counter_limit(&state->inotify_instances, + parent_ns->inotify_max_user_instances); +#endif + + spin_lock_bh(&nsuser_state_lock); + hash_add(nsstate_hash, &state->node, __kuid_val(owner)); + spin_unlock_bh(&nsuser_state_lock); + } + return 0; } +/* Delete all state related to a user ns. All processes of a + * namespace should be dead by this time and no references + * to the peruser/perns state variables should be live.As such + * we can be modifying the hashtable without holding the lock + */ +static void free_nsuser_state(struct user_namespace *ns) +{ + int bkt; + struct hlist_node *tmp; + struct nsuser_state *state; + + hash_for_each_safe(nsstate_hash, bkt, tmp, state, node) { + if (state->ns == ns) { + BUG_ON(page_counter_read(&state->inotify_instances)); + BUG_ON(page_counter_read(&state->inotify_watches)); + + hash_del(&state->node); + kfree(state); + } + } +} int unshare_userns(unsigned long unshare_flags, struct cred **new_cred) { @@ -141,6 +223,10 @@ void free_user_ns(struct user_namespace *ns) do { parent = ns->parent; + + spin_lock_bh(&nsuser_state_lock); + free_nsuser_state(ns); + spin_unlock_bh(&nsuser_state_lock); #ifdef CONFIG_PERSISTENT_KEYRINGS key_put(ns->persistent_keyring_register); #endif @@ -1000,7 +1086,25 @@ const struct proc_ns_operations userns_operations = { static __init int user_namespaces_init(void) { + struct nsuser_state *root_state = kmalloc(sizeof(struct nsuser_state), + GFP_KERNEL); + + init_user_ns.inotify_max_user_instances = 256; + init_user_ns.inotify_max_user_watches = 8192; + +#ifdef CONFIG_INOTIFY_USE + page_counter_init(&root_state->inotify_watches, 
NULL); + page_counter_init(&root_state->inotify_instances, NULL); + page_counter_limit(&root_state->inotify_watches, + init_user_ns.inotify_max_user_watches); + page_counter_limit(&root_state->inotify_instances, + init_user_ns.inotify_max_user_instances); +#endif + root_state->uid = GLOBAL_ROOT_UID; + root_state->ns = &init_user_ns; + hash_add(nsstate_hash, &root_state->node, __kuid_val(GLOBAL_ROOT_UID)); user_ns_cachep = KMEM_CACHE(user_namespace, SLAB_PANIC); + return 0; } subsys_initcall(user_namespaces_init); -- 2.5.0 _______________________________________________ Containers mailing list Containers@xxxxxxxxxxxxxxxxxxxxxxxxxx https://lists.linuxfoundation.org/mailman/listinfo/containers