Re: [viro-vfs:work.dcache2] [__dentry_kill()] 1b738f196e: stress-ng.sysinfo.ops_per_sec -27.2% regression

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Wed, Dec 06, 2023 at 06:24:29PM +0100, Mateusz Guzik wrote:
> On 12/6/23, Al Viro <viro@xxxxxxxxxxxxxxxxxx> wrote:
> > On Wed, Dec 06, 2023 at 05:42:34PM +0100, Mateusz Guzik wrote:
> >
> >> That is to say your patchset is probably an improvement, but this
> >> benchmark uses kernfs which is a total crapper, with code like this in
> >> kernfs_iop_permission:
> >>
> >>         root = kernfs_root(kn);
> >>
> >>         down_read(&root->kernfs_iattr_rwsem);
> >>         kernfs_refresh_inode(kn, inode);
> >>         ret = generic_permission(&nop_mnt_idmap, inode, mask);
> >>         up_read(&root->kernfs_iattr_rwsem);
> >>
> >>
> >> Maybe there is an easy way to dodge this, off hand I don't see one.
> >
> > At a guess - seqcount on kernfs nodes, bumped on metadata changes
> > and a seqretry loop, not that this was the only problem with kernfs
> > scalability.
> >
> 
> I assumed you can't have possibly changing inode fields around
> generic_permission.

That's not the problem; kernfs_refresh_inode(), OTOH, is.
Locking in kernfs is really atrocious ;-/

I would prefer to make that thing per-node (and not an rwsem, obviously,
seqloct_t would suffice), but let's see what the minimal change would do
- turn that into a mutex + seqcount and keep them in the same place.

Below is completely untested, just to see if it would affect the sysinfo
side of things (both with and without dcache series - it's independent
from that):

diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 8b2bd65d70e7..2784ac117a1f 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -383,11 +383,13 @@ static int kernfs_link_sibling(struct kernfs_node *kn)
 	rb_insert_color(&kn->rb, &kn->parent->dir.children);
 
 	/* successfully added, account subdir number */
-	down_write(&kernfs_root(kn)->kernfs_iattr_rwsem);
+	mutex_lock(&kernfs_root(kn)->kernfs_iattr_lock);
+	write_seqcount_begin(&kernfs_root(kn)->kernfs_iattr_seq);
 	if (kernfs_type(kn) == KERNFS_DIR)
 		kn->parent->dir.subdirs++;
 	kernfs_inc_rev(kn->parent);
-	up_write(&kernfs_root(kn)->kernfs_iattr_rwsem);
+	write_seqcount_end(&kernfs_root(kn)->kernfs_iattr_seq);
+	mutex_unlock(&kernfs_root(kn)->kernfs_iattr_lock);
 
 	return 0;
 }
@@ -410,11 +412,12 @@ static bool kernfs_unlink_sibling(struct kernfs_node *kn)
 	if (RB_EMPTY_NODE(&kn->rb))
 		return false;
 
-	down_write(&kernfs_root(kn)->kernfs_iattr_rwsem);
+	mutex_lock(&kernfs_root(kn)->kernfs_iattr_lock);
+	write_seqcount_begin(&kernfs_root(kn)->kernfs_iattr_seq);
 	if (kernfs_type(kn) == KERNFS_DIR)
 		kn->parent->dir.subdirs--;
-	kernfs_inc_rev(kn->parent);
-	up_write(&kernfs_root(kn)->kernfs_iattr_rwsem);
+	write_seqcount_end(&kernfs_root(kn)->kernfs_iattr_seq);
+	mutex_unlock(&kernfs_root(kn)->kernfs_iattr_lock);
 
 	rb_erase(&kn->rb, &kn->parent->dir.children);
 	RB_CLEAR_NODE(&kn->rb);
@@ -639,15 +642,11 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
 	kn->flags = flags;
 
 	if (!uid_eq(uid, GLOBAL_ROOT_UID) || !gid_eq(gid, GLOBAL_ROOT_GID)) {
-		struct iattr iattr = {
-			.ia_valid = ATTR_UID | ATTR_GID,
-			.ia_uid = uid,
-			.ia_gid = gid,
-		};
-
-		ret = __kernfs_setattr(kn, &iattr);
-		if (ret < 0)
+		kn->iattr = kernfs_alloc_iattrs(uid, gid);
+		if (!kn->iattr) {
+			ret = -ENOMEM;
 			goto err_out3;
+		}
 	}
 
 	if (parent) {
@@ -776,7 +775,8 @@ int kernfs_add_one(struct kernfs_node *kn)
 		goto out_unlock;
 
 	/* Update timestamps on the parent */
-	down_write(&root->kernfs_iattr_rwsem);
+	mutex_lock(&root->kernfs_iattr_lock);
+	write_seqcount_begin(&root->kernfs_iattr_seq);
 
 	ps_iattr = parent->iattr;
 	if (ps_iattr) {
@@ -784,7 +784,8 @@ int kernfs_add_one(struct kernfs_node *kn)
 		ps_iattr->ia_mtime = ps_iattr->ia_ctime;
 	}
 
-	up_write(&root->kernfs_iattr_rwsem);
+	write_seqcount_end(&root->kernfs_iattr_seq);
+	mutex_unlock(&root->kernfs_iattr_lock);
 	up_write(&root->kernfs_rwsem);
 
 	/*
@@ -949,7 +950,8 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
 
 	idr_init(&root->ino_idr);
 	init_rwsem(&root->kernfs_rwsem);
-	init_rwsem(&root->kernfs_iattr_rwsem);
+	mutex_init(&root->kernfs_iattr_lock);
+	seqcount_mutex_init(&root->kernfs_iattr_seq, &root->kernfs_iattr_lock);
 	init_rwsem(&root->kernfs_supers_rwsem);
 	INIT_LIST_HEAD(&root->supers);
 
@@ -1473,14 +1475,16 @@ static void __kernfs_remove(struct kernfs_node *kn)
 				pos->parent ? pos->parent->iattr : NULL;
 
 			/* update timestamps on the parent */
-			down_write(&kernfs_root(kn)->kernfs_iattr_rwsem);
+			mutex_lock(&kernfs_root(kn)->kernfs_iattr_lock);
+			write_seqcount_begin(&kernfs_root(kn)->kernfs_iattr_seq);
 
 			if (ps_iattr) {
 				ktime_get_real_ts64(&ps_iattr->ia_ctime);
 				ps_iattr->ia_mtime = ps_iattr->ia_ctime;
 			}
 
-			up_write(&kernfs_root(kn)->kernfs_iattr_rwsem);
+			write_seqcount_end(&kernfs_root(kn)->kernfs_iattr_seq);
+			mutex_unlock(&kernfs_root(kn)->kernfs_iattr_lock);
 			kernfs_put(pos);
 		}
 
diff --git a/fs/kernfs/inode.c b/fs/kernfs/inode.c
index b83054da68b3..4b77931c814d 100644
--- a/fs/kernfs/inode.c
+++ b/fs/kernfs/inode.c
@@ -24,56 +24,49 @@ static const struct inode_operations kernfs_iops = {
 	.listxattr	= kernfs_iop_listxattr,
 };
 
-static struct kernfs_iattrs *__kernfs_iattrs(struct kernfs_node *kn, int alloc)
+struct kernfs_iattrs *kernfs_alloc_iattrs(kuid_t uid, kgid_t gid)
 {
-	static DEFINE_MUTEX(iattr_mutex);
-	struct kernfs_iattrs *ret;
+	struct kernfs_iattrs *ret = kmem_cache_zalloc(kernfs_iattrs_cache, GFP_KERNEL);
 
-	mutex_lock(&iattr_mutex);
+	if (ret) {
+		ret->ia_uid = uid;
+		ret->ia_gid = gid;
 
-	if (kn->iattr || !alloc)
-		goto out_unlock;
+		ktime_get_real_ts64(&ret->ia_atime);
+		ret->ia_ctime = ret->ia_mtime = ret->ia_atime;
 
-	kn->iattr = kmem_cache_zalloc(kernfs_iattrs_cache, GFP_KERNEL);
-	if (!kn->iattr)
-		goto out_unlock;
-
-	/* assign default attributes */
-	kn->iattr->ia_uid = GLOBAL_ROOT_UID;
-	kn->iattr->ia_gid = GLOBAL_ROOT_GID;
-
-	ktime_get_real_ts64(&kn->iattr->ia_atime);
-	kn->iattr->ia_mtime = kn->iattr->ia_atime;
-	kn->iattr->ia_ctime = kn->iattr->ia_atime;
-
-	simple_xattrs_init(&kn->iattr->xattrs);
-	atomic_set(&kn->iattr->nr_user_xattrs, 0);
-	atomic_set(&kn->iattr->user_xattr_size, 0);
-out_unlock:
-	ret = kn->iattr;
-	mutex_unlock(&iattr_mutex);
+		simple_xattrs_init(&ret->xattrs);
+		atomic_set(&ret->nr_user_xattrs, 0);
+		atomic_set(&ret->user_xattr_size, 0);
+	}
 	return ret;
 }
 
 static struct kernfs_iattrs *kernfs_iattrs(struct kernfs_node *kn)
 {
-	return __kernfs_iattrs(kn, 1);
+	struct kernfs_iattrs *ret = READ_ONCE(kn->iattr);
+
+	if (!ret) {
+		struct kernfs_iattrs *new;
+		new = kernfs_alloc_iattrs(GLOBAL_ROOT_UID, GLOBAL_ROOT_GID);
+		ret = cmpxchg(&kn->iattr, NULL, new);
+		if (likely(!ret))
+			return new;
+		kfree(new);
+	}
+	return ret;
 }
 
 static struct kernfs_iattrs *kernfs_iattrs_noalloc(struct kernfs_node *kn)
 {
-	return __kernfs_iattrs(kn, 0);
+	return READ_ONCE(kn->iattr);
 }
 
-int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
+static void __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
 {
-	struct kernfs_iattrs *attrs;
+	struct kernfs_iattrs *attrs = kernfs_iattrs_noalloc(kn);
 	unsigned int ia_valid = iattr->ia_valid;
 
-	attrs = kernfs_iattrs(kn);
-	if (!attrs)
-		return -ENOMEM;
-
 	if (ia_valid & ATTR_UID)
 		attrs->ia_uid = iattr->ia_uid;
 	if (ia_valid & ATTR_GID)
@@ -86,7 +79,6 @@ int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
 		attrs->ia_ctime = iattr->ia_ctime;
 	if (ia_valid & ATTR_MODE)
 		kn->mode = iattr->ia_mode;
-	return 0;
 }
 
 /**
@@ -98,13 +90,17 @@ int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
  */
 int kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr)
 {
-	int ret;
 	struct kernfs_root *root = kernfs_root(kn);
 
-	down_write(&root->kernfs_iattr_rwsem);
-	ret = __kernfs_setattr(kn, iattr);
-	up_write(&root->kernfs_iattr_rwsem);
-	return ret;
+	if (!kernfs_iattrs(kn))
+		return -ENOMEM;
+
+	mutex_lock(&root->kernfs_iattr_lock);
+	write_seqcount_begin(&root->kernfs_iattr_seq);
+	__kernfs_setattr(kn, iattr);
+	write_seqcount_end(&root->kernfs_iattr_seq);
+	mutex_unlock(&root->kernfs_iattr_lock);
+	return 0;
 }
 
 int kernfs_iop_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
@@ -117,22 +113,23 @@ int kernfs_iop_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
 
 	if (!kn)
 		return -EINVAL;
+	if (!kernfs_iattrs(kn))
+		return -ENOMEM;
 
 	root = kernfs_root(kn);
-	down_write(&root->kernfs_iattr_rwsem);
+	mutex_lock(&root->kernfs_iattr_lock);
+	write_seqcount_begin(&root->kernfs_iattr_seq);
 	error = setattr_prepare(&nop_mnt_idmap, dentry, iattr);
 	if (error)
 		goto out;
 
-	error = __kernfs_setattr(kn, iattr);
-	if (error)
-		goto out;
-
+	__kernfs_setattr(kn, iattr);
 	/* this ignores size changes */
 	setattr_copy(&nop_mnt_idmap, inode, iattr);
 
 out:
-	up_write(&root->kernfs_iattr_rwsem);
+	write_seqcount_end(&root->kernfs_iattr_seq);
+	mutex_unlock(&root->kernfs_iattr_lock);
 	return error;
 }
 
@@ -187,11 +184,13 @@ int kernfs_iop_getattr(struct mnt_idmap *idmap,
 	struct inode *inode = d_inode(path->dentry);
 	struct kernfs_node *kn = inode->i_private;
 	struct kernfs_root *root = kernfs_root(kn);
+	unsigned seq;
 
-	down_read(&root->kernfs_iattr_rwsem);
-	kernfs_refresh_inode(kn, inode);
-	generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
-	up_read(&root->kernfs_iattr_rwsem);
+	do {
+		seq = read_seqcount_begin(&root->kernfs_iattr_seq);
+		kernfs_refresh_inode(kn, inode);
+		generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat);
+	} while (read_seqcount_retry(&root->kernfs_iattr_seq, seq));
 
 	return 0;
 }
@@ -276,7 +275,7 @@ int kernfs_iop_permission(struct mnt_idmap *idmap,
 {
 	struct kernfs_node *kn;
 	struct kernfs_root *root;
-	int ret;
+	unsigned seq;
 
 	if (mask & MAY_NOT_BLOCK)
 		return -ECHILD;
@@ -284,12 +283,11 @@ int kernfs_iop_permission(struct mnt_idmap *idmap,
 	kn = inode->i_private;
 	root = kernfs_root(kn);
 
-	down_read(&root->kernfs_iattr_rwsem);
-	kernfs_refresh_inode(kn, inode);
-	ret = generic_permission(&nop_mnt_idmap, inode, mask);
-	up_read(&root->kernfs_iattr_rwsem);
-
-	return ret;
+	do {
+		seq = read_seqcount_begin(&root->kernfs_iattr_seq);
+		kernfs_refresh_inode(kn, inode);
+	} while (read_seqcount_retry(&root->kernfs_iattr_seq, seq));
+	return generic_permission(&nop_mnt_idmap, inode, mask);
 }
 
 int kernfs_xattr_get(struct kernfs_node *kn, const char *name,
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index 237f2764b941..0aea5151ed1a 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -47,7 +47,8 @@ struct kernfs_root {
 
 	wait_queue_head_t	deactivate_waitq;
 	struct rw_semaphore	kernfs_rwsem;
-	struct rw_semaphore	kernfs_iattr_rwsem;
+	struct mutex		kernfs_iattr_lock;
+	seqcount_mutex_t	kernfs_iattr_seq;
 	struct rw_semaphore	kernfs_supers_rwsem;
 };
 
@@ -137,7 +138,7 @@ int kernfs_iop_getattr(struct mnt_idmap *idmap,
 		       const struct path *path, struct kstat *stat,
 		       u32 request_mask, unsigned int query_flags);
 ssize_t kernfs_iop_listxattr(struct dentry *dentry, char *buf, size_t size);
-int __kernfs_setattr(struct kernfs_node *kn, const struct iattr *iattr);
+struct kernfs_iattrs *kernfs_alloc_iattrs(kuid_t uid, kgid_t gid);
 
 /*
  * dir.c




[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [NTFS 3]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [NTFS 3]     [Samba]     [Device Mapper]     [CEPH Development]

  Powered by Linux