On Tue, Dec 01, 2015 at 12:21:31AM +0100, Nicolai Stange wrote: > Nothing prevents a dentry found by path lookup before a return of > __debugfs_remove() to actually get opened after that return. Now, after > the return of __debugfs_remove(), there are no guarantees whatsoever > regarding the memory the corresponding inode's file_operations object > had been kept in. > > Since __debugfs_remove() is seldomly invoked, usually from module exit > handlers only, the race is hard to trigger and the impact is very low. > > A discussion of the problem outlined above as well as a suggested > solution can be found in the (sub-)thread rooted at > > http://lkml.kernel.org/g/20130401203445.GA20862@xxxxxxxxxxxxxxxxxx > ("Yet another pipe related oops.") > > Basically, Greg KH suggests to introduce an intermediate fops and > Al Viro points out that a pointer to the original ones may be stored in > ->d_fsdata. > > Follow this line of reasoning: > - Add SRCU as a reverse dependency of DEBUG_FS. > - Introduce a srcu_struct object for the debugfs subsystem. > - In debugfs_create_file(), store a pointer to the original > file_operations object in ->d_fsdata. > - In __debugfs_remove(), clear out that dentry->d_fsdata member again. > debugfs_remove() and debugfs_remove_recursive() wait for a SRCU grace > period before returning to their caller. > - Introduce an intermediate file_operations object named > "debugfs_proxy_file_operations". It's ->open() functions checks, > under the protection of a SRCU read lock, whether the "original" > file_operations pointed to by ->d_fsdata is still valid and if so, > tries to acquire a reference on the owning module. On success, it sets > the file object's ->f_op to the original file_operations and forwards > the ongoing open() call to the original ->open(). > - For clarity, rename the former debugfs_file_operations to > debugfs_noop_file_operations -- they are in no way canonical. > > The choice of SRCU over "normal" RCU is justified by the fact, that the > former may also be used to protect ->i_private data from going away > during the execution of a file's readers and writers which may (and do) > sleep. > > Signed-off-by: Nicolai Stange <nicstange@xxxxxxxxx> One question below. Other than that, looks good from an RCU perspective. Thanx, Paul > --- > Applicable to the Linus tree. > The second of the two patches depends on the first one > > fs/debugfs/file.c | 29 ++++++++++++++++++++++++++++- > fs/debugfs/inode.c | 16 +++++++++++++--- > include/linux/debugfs.h | 6 +++++- > lib/Kconfig.debug | 1 + > 4 files changed, 47 insertions(+), 5 deletions(-) > > diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c > index d2ba12e..67df2c9 100644 > --- a/fs/debugfs/file.c > +++ b/fs/debugfs/file.c > @@ -35,13 +35,40 @@ static ssize_t default_write_file(struct file *file, const char __user *buf, > return count; > } > > -const struct file_operations debugfs_file_operations = { > +const struct file_operations debugfs_noop_file_operations = { > .read = default_read_file, > .write = default_write_file, > .open = simple_open, > .llseek = noop_llseek, > }; > > +static int proxy_open(struct inode *inode, struct file *filp) > +{ > + struct dentry * const dentry = filp->f_path.dentry; > + const struct file_operations __rcu *rcu_real_fops; > + const struct file_operations *real_fops; > + int srcu_idx; > + > + srcu_idx = srcu_read_lock(&debugfs_srcu); > + rcu_real_fops = (const struct file_operations __rcu *)dentry->d_fsdata; > + real_fops = srcu_dereference(rcu_real_fops, &debugfs_srcu); > + real_fops = fops_get(real_fops); > + srcu_read_unlock(&debugfs_srcu, srcu_idx); > + > + if (!real_fops) > + return -ENOENT; > + > + replace_fops(filp, real_fops); > + if (filp->f_op->open) > + return filp->f_op->open(inode, filp); > + > + return 0; > +} > + > +const struct file_operations debugfs_proxy_file_operations = { > + .open = proxy_open, > +}; > + > static struct dentry *debugfs_create_mode(const char *name, umode_t mode, > struct dentry *parent, void *value, > const struct file_operations *fops, > diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c > index b7fcc0d..8ae2e1a 100644 > --- a/fs/debugfs/inode.c > +++ b/fs/debugfs/inode.c > @@ -30,6 +30,8 @@ > > #define DEBUGFS_DEFAULT_MODE 0700 > > +DEFINE_SRCU(debugfs_srcu); > + > static struct vfsmount *debugfs_mount; > static int debugfs_mount_count; > static bool debugfs_registered; > @@ -340,8 +342,12 @@ struct dentry *debugfs_create_file(const char *name, umode_t mode, > return failed_creating(dentry); > > inode->i_mode = mode; > - inode->i_fop = fops ? fops : &debugfs_file_operations; > inode->i_private = data; > + > + inode->i_fop = fops ? &debugfs_proxy_file_operations > + : &debugfs_noop_file_operations; > + rcu_assign_pointer(dentry->d_fsdata, (void *)fops); > + > d_instantiate(dentry, inode); > fsnotify_create(d_inode(dentry->d_parent), dentry); > return end_creating(dentry); > @@ -523,10 +529,12 @@ static int __debugfs_remove(struct dentry *dentry, struct dentry *parent) > > if (simple_positive(dentry)) { > dget(dentry); > - if (d_is_dir(dentry)) > + if (d_is_dir(dentry)) { > ret = simple_rmdir(d_inode(parent), dentry); > - else > + } else { > simple_unlink(d_inode(parent), dentry); > + rcu_assign_pointer(dentry->d_fsdata, NULL); > + } > if (!ret) > d_delete(dentry); > dput(dentry); > @@ -565,6 +573,7 @@ void debugfs_remove(struct dentry *dentry) > mutex_unlock(&d_inode(parent)->i_mutex); > if (!ret) > simple_release_fs(&debugfs_mount, &debugfs_mount_count); > + synchronize_srcu(&debugfs_srcu); So the reason that this is OK from a performance perspective is that people removing large quantities of debugfs entries are supposed to use debugfs_remove_recursive()? If there are valid use cases with lots of debugfs_remove() calls in quick succession, and if this results in excessive latency due to lots of synchronize_srcu() calls, then one approach would be to have a debugfs_remove_nowait() or some such that omitted the final synchronize_srcu(). The last call could use debugfs_remove() to clean things up. There are of course a number of alternative approaches. But if there is no performance issue with real workloads, then there is of course no need to do anything. > } > EXPORT_SYMBOL_GPL(debugfs_remove); > > @@ -642,6 +651,7 @@ void debugfs_remove_recursive(struct dentry *dentry) > if (!__debugfs_remove(child, parent)) > simple_release_fs(&debugfs_mount, &debugfs_mount_count); > mutex_unlock(&d_inode(parent)->i_mutex); > + synchronize_srcu(&debugfs_srcu); > } > EXPORT_SYMBOL_GPL(debugfs_remove_recursive); > > diff --git a/include/linux/debugfs.h b/include/linux/debugfs.h > index 19c066d..f8c7494 100644 > --- a/include/linux/debugfs.h > +++ b/include/linux/debugfs.h > @@ -19,6 +19,7 @@ > #include <linux/seq_file.h> > > #include <linux/types.h> > +#include <linux/srcu.h> > > struct device; > struct file_operations; > @@ -44,7 +45,10 @@ extern struct dentry *arch_debugfs_dir; > #if defined(CONFIG_DEBUG_FS) > > /* declared over in file.c */ > -extern const struct file_operations debugfs_file_operations; > +extern const struct file_operations debugfs_noop_file_operations; > +extern const struct file_operations debugfs_proxy_file_operations; > + > +extern struct srcu_struct debugfs_srcu; > > struct dentry *debugfs_create_file(const char *name, umode_t mode, > struct dentry *parent, void *data, > diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug > index 8c15b29..3929878 100644 > --- a/lib/Kconfig.debug > +++ b/lib/Kconfig.debug > @@ -257,6 +257,7 @@ config PAGE_OWNER > > config DEBUG_FS > bool "Debug Filesystem" > + select SRCU > help > debugfs is a virtual file system that kernel developers use to put > debugging files into. Enable this option to be able to read and > -- > 2.6.3 > -- To unsubscribe from this list: send the line "unsubscribe linux-doc" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html