These "_shared" versions of various inode operations are not guaranteed an exclusive lock on the directory but are guaranteed an exclusive lock on the dentry within the directory. i_rwsem *may* be held exclusively or *may* be held shared, in which case an exclusive lock will be held on the dentry - provided by a later patch. This will allow a graceful transition from exclusive to shared locking for directory updates. mkdir_shared is a bit different as it optionally returns a new dentry for cases when the filesystem is not able to use the original dentry. This allows vfs_mkdir_return() to avoid the need for an extra lookup. Signed-off-by: NeilBrown <neilb@xxxxxxx> --- Documentation/filesystems/locking.rst | 28 ++++++- Documentation/filesystems/porting.rst | 10 +++ Documentation/filesystems/vfs.rst | 24 ++++++ fs/namei.c | 108 +++++++++++++++++++------- include/linux/fs.h | 16 ++++ 5 files changed, 158 insertions(+), 28 deletions(-) diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index f5e3676db954..7cacff59356f 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -57,15 +57,24 @@ inode_operations prototypes:: int (*create) (struct mnt_idmap *, struct inode *,struct dentry *,umode_t, bool); + int (*create_shared) (struct mnt_idmap *, struct inode *,struct dentry *,umode_t, bool); struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int); int (*link) (struct dentry *,struct inode *,struct dentry *); + int (*link_shared) (struct dentry *,struct inode *,struct dentry *); int (*unlink) (struct inode *,struct dentry *); + int (*unlink_shared) (struct inode *,struct dentry *); int (*symlink) (struct mnt_idmap *, struct inode *,struct dentry *,const char *); + int (*symlink_shared) (struct mnt_idmap *, struct inode *,struct dentry *,const char *); int (*mkdir) (struct mnt_idmap *, struct inode *,struct dentry *,umode_t); + struct dentry * (*mkdir_shared) (struct mnt_idmap 
*, struct inode *,struct dentry *,umode_t); int (*rmdir) (struct inode *,struct dentry *); + int (*rmdir_shared) (struct inode *,struct dentry *); int (*mknod) (struct mnt_idmap *, struct inode *,struct dentry *,umode_t,dev_t); + int (*mknod_shared) (struct mnt_idmap *, struct inode *,struct dentry *,umode_t,dev_t); int (*rename) (struct mnt_idmap *, struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); + int (*rename_shared) (struct mnt_idmap *, struct inode *, struct dentry *, + struct inode *, struct dentry *, unsigned int); int (*readlink) (struct dentry *, char __user *,int); const char *(*get_link) (struct dentry *, struct inode *, struct delayed_call *); void (*truncate) (struct inode *); @@ -79,6 +88,9 @@ prototypes:: int (*atomic_open)(struct inode *, struct dentry *, struct file *, unsigned open_flag, umode_t create_mode); + int (*atomic_open_shared)(struct inode *, struct dentry *, + struct file *, unsigned open_flag, + umode_t create_mode); int (*tmpfile) (struct mnt_idmap *, struct inode *, struct file *, umode_t); int (*fileattr_set)(struct mnt_idmap *idmap, @@ -90,18 +102,29 @@ prototypes:: locking rules: all may block +A "mixed" lock means that either i_rwsem on the directory is held +exclusively, or i_rwsem is held shared and an exclusive lock is also held +on the dentry in that directory.
============== ================================================== ops i_rwsem(inode) ============== ================================================== lookup: shared create: exclusive +create_shared: mixed link: exclusive (both) +link_shared: exclusive on source, mixed on target mknod: exclusive +mknod_shared: mixed symlink: exclusive +symlink_shared: mixed mkdir: exclusive +mkdir_shared: mixed unlink: exclusive (both) +unlink_shared: exclusive on object, mixed on directory/name rmdir: exclusive (both)(see below) +rmdir_shared: exclusive on object, mixed on directory/name (see below) rename: exclusive (both parents, some children) (see below) +rename_shared: mixed (both parents) exclusive (some children) (see below) readlink: no get_link: no setattr: exclusive @@ -113,6 +136,7 @@ listxattr: no fiemap: no update_time: no atomic_open: shared (exclusive if O_CREAT is set in open flags) +atomic_open_shared: mixed (if O_CREAT is not set, then may not have exclusive lock on name) tmpfile: no fileattr_get: no or exclusive fileattr_set: exclusive @@ -120,8 +144,8 @@ get_offset_ctx no ============== ================================================== - Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_rwsem - exclusive on victim. + Additionally, ->rmdir(), ->unlink() and ->rename(), as well as _shared + versions, have ->i_rwsem exclusive on victim. cross-directory ->rename() has (per-superblock) ->s_vfs_rename_sem. ->unlink() and ->rename() have ->i_rwsem exclusive on all non-directories involved. diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index 9ab2a3d6f2b4..c7f3825f280c 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -1141,3 +1141,13 @@ pointer are gone. set_blocksize() takes opened struct file instead of struct block_device now and it *must* be opened exclusive. 
+ +--- + +**recommended** + +create_shared, link_shared, unlink_shared, symlink_shared, mkdir_shared, +rmdir_shared, mknod_shared, rename_shared, atomic_open_shared can be provided +instead of the corresponding inode_operations without the "_shared" suffix. +Multiple _shared operations can be performed in a given directory concurrently, +but never on the same name. diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index 0b18af3f954e..c4860597975a 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -491,15 +491,24 @@ As of kernel 2.6.22, the following members are defined: struct inode_operations { int (*create) (struct mnt_idmap *, struct inode *,struct dentry *, umode_t, bool); + int (*create_shared) (struct mnt_idmap *, struct inode *,struct dentry *, umode_t, bool); struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int); int (*link) (struct dentry *,struct inode *,struct dentry *); + int (*link_shared) (struct dentry *,struct inode *,struct dentry *); int (*unlink) (struct inode *,struct dentry *); + int (*unlink_shared) (struct inode *,struct dentry *); int (*symlink) (struct mnt_idmap *, struct inode *,struct dentry *,const char *); + int (*symlink_shared) (struct mnt_idmap *, struct inode *,struct dentry *,const char *); int (*mkdir) (struct mnt_idmap *, struct inode *,struct dentry *,umode_t); + struct dentry * (*mkdir_shared) (struct mnt_idmap *, struct inode *,struct dentry *,umode_t); int (*rmdir) (struct inode *,struct dentry *); + int (*rmdir_shared) (struct inode *,struct dentry *); int (*mknod) (struct mnt_idmap *, struct inode *,struct dentry *,umode_t,dev_t); + int (*mknod_shared) (struct mnt_idmap *, struct inode *,struct dentry *,umode_t,dev_t); int (*rename) (struct mnt_idmap *, struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); + int (*rename_shared) (struct mnt_idmap *, struct inode *, struct dentry *, + struct inode *, struct dentry *, unsigned int); int
(*readlink) (struct dentry *, char __user *,int); const char *(*get_link) (struct dentry *, struct inode *, struct delayed_call *); @@ -511,6 +520,8 @@ As of kernel 2.6.22, the following members are defined: void (*update_time)(struct inode *, struct timespec *, int); int (*atomic_open)(struct inode *, struct dentry *, struct file *, unsigned open_flag, umode_t create_mode); + int (*atomic_open_shared)(struct inode *, struct dentry *, struct file *, + unsigned open_flag, umode_t create_mode); int (*tmpfile) (struct mnt_idmap *, struct inode *, struct file *, umode_t); struct posix_acl * (*get_acl)(struct mnt_idmap *, struct dentry *, int); int (*set_acl)(struct mnt_idmap *, struct dentry *, struct posix_acl *, int); @@ -524,6 +535,7 @@ Again, all methods are called without any locks being held, unless otherwise noted. ``create`` +``create_shared`` called by the open(2) and creat(2) system calls. Only required if you want to support regular files. The dentry you get should not have an inode (i.e. it should be a negative dentry). Here @@ -546,29 +558,39 @@ otherwise noted. directory inode semaphore held ``link`` +``link_shared`` called by the link(2) system call. Only required if you want to support hard links. You will probably need to call d_instantiate() just as you would in the create() method ``unlink`` +``unlink_shared`` called by the unlink(2) system call. Only required if you want to support deleting inodes ``symlink`` +``symlink_shared`` called by the symlink(2) system call. Only required if you want to support symlinks. You will probably need to call d_instantiate() just as you would in the create() method ``mkdir`` +``mkdir_shared`` called by the mkdir(2) system call. Only required if you want to support creating subdirectories. You will probably need to call d_instantiate() just as you would in the create() method + mkdir_shared can return an alternate dentry, much like lookup. 
+ In this case the original dentry will still be negative and will + be unhashed. + ``rmdir`` +``rmdir_shared`` called by the rmdir(2) system call. Only required if you want to support deleting subdirectories ``mknod`` +``mknod_shared`` called by the mknod(2) system call to create a device (char, block) inode or a named pipe (FIFO) or socket. Only required if you want to support creating these types of inodes. You will @@ -576,6 +598,7 @@ otherwise noted. create() method ``rename`` +``rename_shared`` called by the rename(2) system call to rename the object to have the parent and name given by the second inode and dentry. @@ -647,6 +670,7 @@ otherwise noted. itself and call mark_inode_dirty_sync. ``atomic_open`` +``atomic_open_shared`` called on the last component of an open. Using this optional method the filesystem can look up, possibly create and open the file in one atomic operation. If it wants to leave actual diff --git a/fs/namei.c b/fs/namei.c index cdd1fc9d56a0..65082378dc60 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -3338,14 +3338,17 @@ int vfs_create(struct mnt_idmap *idmap, struct inode *dir, if (error) return error; - if (!dir->i_op->create) + if (!dir->i_op->create && !dir->i_op->create_shared) return -EACCES; /* shouldn't it be ENOSYS? 
*/ mode = vfs_prepare_mode(idmap, dir, mode, S_IALLUGO, S_IFREG); error = security_inode_create(dir, dentry, mode); if (error) return error; - error = dir->i_op->create(idmap, dir, dentry, mode, want_excl); + if (dir->i_op->create_shared) + error = dir->i_op->create_shared(idmap, dir, dentry, mode, want_excl); + else + error = dir->i_op->create(idmap, dir, dentry, mode, want_excl); if (!error) fsnotify_create(dir, dentry); return error; @@ -3506,8 +3509,12 @@ static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry, file->f_path.dentry = DENTRY_NOT_SET; file->f_path.mnt = nd->path.mnt; - error = dir->i_op->atomic_open(dir, dentry, file, - open_to_namei_flags(open_flag), mode); + if (dir->i_op->atomic_open_shared) + error = dir->i_op->atomic_open_shared(dir, dentry, file, + open_to_namei_flags(open_flag), mode); + else + error = dir->i_op->atomic_open(dir, dentry, file, + open_to_namei_flags(open_flag), mode); d_lookup_done(dentry); if (!error) { if (file->f_mode & FMODE_OPENED) { @@ -3616,7 +3623,7 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, } if (create_error) open_flag &= ~O_CREAT; - if (dir_inode->i_op->atomic_open) { + if (dir_inode->i_op->atomic_open || dir_inode->i_op->atomic_open_shared) { dentry = atomic_open(nd, dentry, file, open_flag, mode); if (unlikely(create_error) && dentry == ERR_PTR(-ENOENT)) dentry = ERR_PTR(create_error); @@ -3641,13 +3648,17 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, if (!dentry->d_inode && (open_flag & O_CREAT)) { file->f_mode |= FMODE_CREATED; audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE); - if (!dir_inode->i_op->create) { - error = -EACCES; - goto out_dput; - } - error = dir_inode->i_op->create(idmap, dir_inode, dentry, - mode, open_flag & O_EXCL); + if (dir_inode->i_op->create_shared) + error = dir_inode->i_op->create_shared(idmap, dir_inode, + dentry, mode, + open_flag & O_EXCL); + else if (dir_inode->i_op->create) + 
error = dir_inode->i_op->create(idmap, dir_inode, + dentry, mode, + open_flag & O_EXCL); + else + error = -EACCES; if (error) goto out_dput; } @@ -4174,7 +4185,7 @@ int vfs_mknod(struct mnt_idmap *idmap, struct inode *dir, !capable(CAP_MKNOD)) return -EPERM; - if (!dir->i_op->mknod) + if (!dir->i_op->mknod && !dir->i_op->mknod_shared) return -EPERM; mode = vfs_prepare_mode(idmap, dir, mode, mode, mode); @@ -4186,7 +4197,10 @@ int vfs_mknod(struct mnt_idmap *idmap, struct inode *dir, if (error) return error; - error = dir->i_op->mknod(idmap, dir, dentry, mode, dev); + if (dir->i_op->mknod_shared) + error = dir->i_op->mknod_shared(idmap, dir, dentry, mode, dev); + else + error = dir->i_op->mknod(idmap, dir, dentry, mode, dev); if (!error) fsnotify_create(dir, dentry); return error; @@ -4297,7 +4311,7 @@ int vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, if (error) return error; - if (!dir->i_op->mkdir) + if (!dir->i_op->mkdir && !dir->i_op->mkdir_shared) return -EPERM; mode = vfs_prepare_mode(idmap, dir, mode, S_IRWXUGO | S_ISVTX, 0); @@ -4308,7 +4322,16 @@ int vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, if (max_links && dir->i_nlink >= max_links) return -EMLINK; - error = dir->i_op->mkdir(idmap, dir, dentry, mode); + if (dir->i_op->mkdir_shared) { + struct dentry *de; + de = dir->i_op->mkdir_shared(idmap, dir, dentry, mode); + if (IS_ERR(de)) + error = PTR_ERR(de); + else if (de) + dput(de); + } else { + error = dir->i_op->mkdir(idmap, dir, dentry, mode); + } if (!error) fsnotify_mkdir(dir, dentry); return error; @@ -4356,6 +4379,20 @@ int vfs_mkdir_return(struct mnt_idmap *idmap, struct inode *dir, if (max_links && dir->i_nlink >= max_links) return -EMLINK; + if (dir->i_op->mkdir_shared) { + struct dentry *de; + + de = dir->i_op->mkdir_shared(idmap, dir, dentry, mode); + if (IS_ERR(de)) + return PTR_ERR(de); + if (de) { + dput(dentry); + *dentryp = de; + } + fsnotify_mkdir(dir, dentry); + return 0; + } + error = dir->i_op->mkdir(idmap, dir, dentry, 
mode); if (!error) { fsnotify_mkdir(dir, dentry); @@ -4439,7 +4476,7 @@ int vfs_rmdir(struct mnt_idmap *idmap, struct inode *dir, if (error) return error; - if (!dir->i_op->rmdir) + if (!dir->i_op->rmdir && !dir->i_op->rmdir_shared) return -EPERM; dget(dentry); @@ -4454,7 +4491,10 @@ int vfs_rmdir(struct mnt_idmap *idmap, struct inode *dir, if (error) goto out; - error = dir->i_op->rmdir(dir, dentry); + if (dir->i_op->rmdir_shared) + error = dir->i_op->rmdir_shared(dir, dentry); + else + error = dir->i_op->rmdir(dir, dentry); if (error) goto out; @@ -4569,7 +4609,7 @@ int vfs_unlink(struct mnt_idmap *idmap, struct inode *dir, if (error) return error; - if (!dir->i_op->unlink) + if (!dir->i_op->unlink && !dir->i_op->unlink_shared) return -EPERM; inode_lock(target); @@ -4583,7 +4623,10 @@ int vfs_unlink(struct mnt_idmap *idmap, struct inode *dir, error = try_break_deleg(target, delegated_inode); if (error) goto out; - error = dir->i_op->unlink(dir, dentry); + if (dir->i_op->unlink_shared) + error = dir->i_op->unlink_shared(dir, dentry); + else + error = dir->i_op->unlink(dir, dentry); if (!error) { dont_mount(dentry); detach_mounts(dentry); @@ -4722,14 +4765,17 @@ int vfs_symlink(struct mnt_idmap *idmap, struct inode *dir, if (error) return error; - if (!dir->i_op->symlink) + if (!dir->i_op->symlink && !dir->i_op->symlink_shared) return -EPERM; error = security_inode_symlink(dir, dentry, oldname); if (error) return error; - error = dir->i_op->symlink(idmap, dir, dentry, oldname); + if (dir->i_op->symlink_shared) + error = dir->i_op->symlink_shared(idmap, dir, dentry, oldname); + else + error = dir->i_op->symlink(idmap, dir, dentry, oldname); if (!error) fsnotify_create(dir, dentry); return error; @@ -4835,7 +4881,7 @@ int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap, */ if (HAS_UNMAPPED_ID(idmap, inode)) return -EPERM; - if (!dir->i_op->link) + if (!dir->i_op->link && !dir->i_op->link_shared) return -EPERM; if (S_ISDIR(inode->i_mode)) return -EPERM; @@ 
-4852,7 +4898,11 @@ int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap, error = -EMLINK; else { error = try_break_deleg(inode, delegated_inode); - if (!error) + if (error) + ; + else if (dir->i_op->link_shared) + error = dir->i_op->link_shared(old_dentry, dir, new_dentry); + else error = dir->i_op->link(old_dentry, dir, new_dentry); } @@ -5044,7 +5094,7 @@ int vfs_rename(struct renamedata *rd) if (error) return error; - if (!old_dir->i_op->rename) + if (!old_dir->i_op->rename && !old_dir->i_op->rename_shared) return -EPERM; /* @@ -5127,8 +5177,14 @@ int vfs_rename(struct renamedata *rd) if (error) goto out; } - error = old_dir->i_op->rename(rd->new_mnt_idmap, old_dir, old_dentry, - new_dir, new_dentry, flags); + if (old_dir->i_op->rename_shared) + error = old_dir->i_op->rename_shared(rd->new_mnt_idmap, + old_dir, old_dentry, + new_dir, new_dentry, flags); + else + error = old_dir->i_op->rename(rd->new_mnt_idmap, + old_dir, old_dentry, + new_dir, new_dentry, flags); if (error) goto out; diff --git a/include/linux/fs.h b/include/linux/fs.h index 406887d0394e..68eba181175b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2147,17 +2147,30 @@ struct inode_operations { int (*create) (struct mnt_idmap *, struct inode *,struct dentry *, umode_t, bool); + int (*create_shared) (struct mnt_idmap *, struct inode *,struct dentry *, + umode_t, bool); int (*link) (struct dentry *,struct inode *,struct dentry *); + int (*link_shared) (struct dentry *,struct inode *,struct dentry *); int (*unlink) (struct inode *,struct dentry *); + int (*unlink_shared) (struct inode *,struct dentry *); int (*symlink) (struct mnt_idmap *, struct inode *,struct dentry *, const char *); + int (*symlink_shared) (struct mnt_idmap *, struct inode *,struct dentry *, + const char *); int (*mkdir) (struct mnt_idmap *, struct inode *,struct dentry *, umode_t); + struct dentry * (*mkdir_shared) (struct mnt_idmap *, struct inode *,struct dentry *, + umode_t); int (*rmdir) (struct 
inode *,struct dentry *); + int (*rmdir_shared) (struct inode *,struct dentry *); int (*mknod) (struct mnt_idmap *, struct inode *,struct dentry *, umode_t,dev_t); + int (*mknod_shared) (struct mnt_idmap *, struct inode *,struct dentry *, + umode_t,dev_t); int (*rename) (struct mnt_idmap *, struct inode *, struct dentry *, struct inode *, struct dentry *, unsigned int); + int (*rename_shared) (struct mnt_idmap *, struct inode *, struct dentry *, + struct inode *, struct dentry *, unsigned int); int (*setattr) (struct mnt_idmap *, struct dentry *, struct iattr *); int (*getattr) (struct mnt_idmap *, const struct path *, struct kstat *, u32, unsigned int); @@ -2168,6 +2181,9 @@ struct inode_operations { int (*atomic_open)(struct inode *, struct dentry *, struct file *, unsigned open_flag, umode_t create_mode); + int (*atomic_open_shared)(struct inode *, struct dentry *, + struct file *, unsigned open_flag, + umode_t create_mode); int (*tmpfile) (struct mnt_idmap *, struct inode *, struct file *, umode_t); struct posix_acl *(*get_acl)(struct mnt_idmap *, struct dentry *, -- 2.47.0