On 08/04/2015 12:26 AM, Eric W. Biederman wrote: > > This is needed infrastructure for better handling of when files > or directories are moved out from under the root of a bind mount. > > Signed-off-by: "Eric W. Biederman" <ebiederm@xxxxxxxxxxxx> > --- > fs/mount.h | 7 +++ > fs/namespace.c | 120 +++++++++++++++++++++++++++++++++++++++++++++++-- > include/linux/dcache.h | 7 +++ > 3 files changed, 130 insertions(+), 4 deletions(-) > > diff --git a/fs/mount.h b/fs/mount.h > index 14db05d424f7..e8f22970fe59 100644 > --- a/fs/mount.h > +++ b/fs/mount.h > @@ -27,6 +27,12 @@ struct mountpoint { > int m_count; > }; > > +struct mountroot { > + struct hlist_node r_hash; > + struct dentry *r_dentry; > + struct hlist_head r_list; > +}; > + > struct mount { > struct hlist_node mnt_hash; > struct mount *mnt_parent; > @@ -55,6 +61,7 @@ struct mount { > struct mnt_namespace *mnt_ns; /* containing namespace */ > struct mountpoint *mnt_mp; /* where is it mounted */ > struct hlist_node mnt_mp_list; /* list mounts with the same mountpoint */ > + struct hlist_node mnt_mr_list; /* list mounts with the same mountroot */ > #ifdef CONFIG_FSNOTIFY > struct hlist_head mnt_fsnotify_marks; > __u32 mnt_fsnotify_mask; > diff --git a/fs/namespace.c b/fs/namespace.c > index 2b8aa15fd6df..2ce987af9afa 100644 > --- a/fs/namespace.c > +++ b/fs/namespace.c > @@ -31,6 +31,8 @@ static unsigned int m_hash_mask __read_mostly; > static unsigned int m_hash_shift __read_mostly; > static unsigned int mp_hash_mask __read_mostly; > static unsigned int mp_hash_shift __read_mostly; > +static unsigned int mr_hash_mask __read_mostly; > +static unsigned int mr_hash_shift __read_mostly; > > static __initdata unsigned long mhash_entries; > static int __init set_mhash_entries(char *str) > @@ -52,6 +54,16 @@ static int __init set_mphash_entries(char *str) > } > __setup("mphash_entries=", set_mphash_entries); > > +static __initdata unsigned long mrhash_entries; > +static int __init set_mrhash_entries(char *str) > +{ > + 
if (!str) > return 0; > mrhash_entries = simple_strtoul(str, &str, 0); Nit: Is there any particular reason for using simple_strtoul() rather than the kstrto*() family of functions (e.g. kstrtoul())? > + return 1; > +} > +__setup("mrhash_entries=", set_mrhash_entries); > + > static u64 event; > static DEFINE_IDA(mnt_id_ida); > static DEFINE_IDA(mnt_group_ida); > @@ -61,6 +73,7 @@ static int mnt_group_start = 1; > > static struct hlist_head *mount_hashtable __read_mostly; > static struct hlist_head *mountpoint_hashtable __read_mostly; > +static struct hlist_head *mountroot_hashtable __read_mostly; > static struct kmem_cache *mnt_cache __read_mostly; > static DECLARE_RWSEM(namespace_sem); > > @@ -93,6 +106,13 @@ static inline struct hlist_head *mp_hash(struct dentry *dentry) > return &mountpoint_hashtable[tmp & mp_hash_mask]; > } > > +static inline struct hlist_head *mr_hash(struct dentry *dentry) > +{ > + unsigned long tmp = ((unsigned long)dentry / L1_CACHE_BYTES); > + tmp = tmp + (tmp >> mr_hash_shift); > + return &mountroot_hashtable[tmp & mr_hash_mask]; > +} > + > /* > * allocation is serialized by namespace_sem, but we need the spinlock to > * serialize with freeing. 
> @@ -234,6 +254,7 @@ static struct mount *alloc_vfsmnt(const char *name) > INIT_LIST_HEAD(&mnt->mnt_slave_list); > INIT_LIST_HEAD(&mnt->mnt_slave); > INIT_HLIST_NODE(&mnt->mnt_mp_list); > + INIT_HLIST_NODE(&mnt->mnt_mr_list); > #ifdef CONFIG_FSNOTIFY > INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks); > #endif > @@ -779,6 +800,77 @@ static void put_mountpoint(struct mountpoint *mp) > } > } > > +static struct mountroot *lookup_mountroot(struct dentry *dentry) > +{ > + struct hlist_head *chain = mr_hash(dentry); > + struct mountroot *mr; > + > + hlist_for_each_entry(mr, chain, r_hash) { > + if (mr->r_dentry == dentry) > + return mr; > + } > + return NULL; > +} > + > +static int mnt_set_root(struct mount *mnt, struct dentry *root) > +{ > + struct mountroot *mr = NULL; > + > + read_seqlock_excl(&mount_lock); > + if (d_mountroot(root)) > + mr = lookup_mountroot(root); > + if (!mr) { > + struct mountroot *new; > + read_sequnlock_excl(&mount_lock); > + > + new = kmalloc(sizeof(struct mountroot), GFP_KERNEL); > + if (!new) > + return -ENOMEM; > + > + read_seqlock_excl(&mount_lock); > + mr = lookup_mountroot(root); > + if (mr) { > + kfree(new); > + } else { > + struct hlist_head *chain = mr_hash(root); > + > + mr = new; > + mr->r_dentry = root; > + INIT_HLIST_HEAD(&mr->r_list); > + hlist_add_head(&mr->r_hash, chain); > + > + spin_lock(&root->d_lock); > + root->d_flags |= DCACHE_MOUNTROOT; > + spin_unlock(&root->d_lock); > + } > + } > + mnt->mnt.mnt_root = root; > + hlist_add_head(&mnt->mnt_mr_list, &mr->r_list); > + read_sequnlock_excl(&mount_lock); > + > + return 0; > +} > + > +static void mnt_put_root(struct mount *mnt) > +{ > + struct dentry *root = mnt->mnt.mnt_root; > + struct mountroot *mr; > + > + read_seqlock_excl(&mount_lock); > + mr = lookup_mountroot(root); > + BUG_ON(!mr); > + hlist_del(&mnt->mnt_mr_list); > + if (hlist_empty(&mr->r_list)) { > + hlist_del(&mr->r_hash); > + spin_lock(&root->d_lock); > + root->d_flags &= ~DCACHE_MOUNTROOT; > + 
spin_unlock(&root->d_lock); > + kfree(mr); > + } > + read_sequnlock_excl(&mount_lock); > + dput(root); > +} > + > static inline int check_mnt(struct mount *mnt) > { > return mnt->mnt_ns == current->nsproxy->mnt_ns; > @@ -934,6 +1026,7 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void > { > struct mount *mnt; > struct dentry *root; > + int err; > > if (!type) > return ERR_PTR(-ENODEV); > @@ -952,8 +1045,16 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void > return ERR_CAST(root); > } > > - mnt->mnt.mnt_root = root; > mnt->mnt.mnt_sb = root->d_sb; > + err = mnt_set_root(mnt, root); > + if (err) { > + dput(root); > + deactivate_super(mnt->mnt.mnt_sb); > + mnt_free_id(mnt); > + free_vfsmnt(mnt); > + return ERR_PTR(err); > + } > + > mnt->mnt_mountpoint = mnt->mnt.mnt_root; > mnt->mnt_parent = mnt; > lock_mount_hash(); > @@ -985,6 +1086,10 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, > goto out_free; > } > > + err = mnt_set_root(mnt, root); > + if (err) > + goto out_free; > + > mnt->mnt.mnt_flags = old->mnt.mnt_flags & ~(MNT_WRITE_HOLD|MNT_MARKED); > /* Don't allow unprivileged users to change mount flags */ > if (flag & CL_UNPRIVILEGED) { > @@ -1010,7 +1115,7 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, > > atomic_inc(&sb->s_active); > mnt->mnt.mnt_sb = sb; > - mnt->mnt.mnt_root = dget(root); > + dget(root); > mnt->mnt_mountpoint = mnt->mnt.mnt_root; > mnt->mnt_parent = mnt; > lock_mount_hash(); > @@ -1063,7 +1168,7 @@ static void cleanup_mnt(struct mount *mnt) > if (unlikely(mnt->mnt_pins.first)) > mnt_pin_kill(mnt); > fsnotify_vfsmount_delete(&mnt->mnt); > - dput(mnt->mnt.mnt_root); > + mnt_put_root(mnt); > deactivate_super(mnt->mnt.mnt_sb); > mnt_free_id(mnt); > call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt); > @@ -3120,14 +3225,21 @@ void __init mnt_init(void) > mphash_entries, 19, > 0, > &mp_hash_shift, &mp_hash_mask, 0, 0); > + 
mountroot_hashtable = alloc_large_system_hash("Mountroot-cache", > + sizeof(struct hlist_head), > + mrhash_entries, 19, > + 0, > + &mr_hash_shift, &mr_hash_mask, 0, 0); > > - if (!mount_hashtable || !mountpoint_hashtable) > + if (!mount_hashtable || !mountpoint_hashtable || !mountroot_hashtable) > panic("Failed to allocate mount hash table\n"); > > for (u = 0; u <= m_hash_mask; u++) > INIT_HLIST_HEAD(&mount_hashtable[u]); > for (u = 0; u <= mp_hash_mask; u++) > INIT_HLIST_HEAD(&mountpoint_hashtable[u]); > + for (u = 0; u <= mr_hash_mask; u++) > + INIT_HLIST_HEAD(&mountroot_hashtable[u]); > > kernfs_init(); > > diff --git a/include/linux/dcache.h b/include/linux/dcache.h > index d67ae119cf4e..52a5e6915f58 100644 > --- a/include/linux/dcache.h > +++ b/include/linux/dcache.h > @@ -228,6 +228,8 @@ struct dentry_operations { > #define DCACHE_FALLTHRU 0x01000000 /* Fall through to lower layer */ > #define DCACHE_OP_SELECT_INODE 0x02000000 /* Unioned entry: dcache op selects inode */ > > +#define DCACHE_MOUNTROOT 0x04000000 /* Root of a vfsmount */ > + > extern seqlock_t rename_lock; > > /* > @@ -404,6 +406,11 @@ static inline bool d_mountpoint(const struct dentry *dentry) > return dentry->d_flags & DCACHE_MOUNTED; > } > > +static inline bool d_mountroot(const struct dentry *dentry) > +{ > + return dentry->d_flags & DCACHE_MOUNTROOT; > +} > + > /* > * Directory cache entry type accessor functions. > */ > -- To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html