[patch] fs: use fast counters for vfs caches

Sorry, forgot to cc linux-fsdevel
--- Begin Message ---
Hey,

What was the reason behind not using my approach of fast per-cpu
counters for the inode and dentry counters, and instead using the
percpu_counter lib? That lib is not useful unless very fast approximate
access to the global counter is required, or performance is not
critical, which is somewhat of an oxymoron if you're using per-cpu
counters in the first place. It is the difference between this:

        incl %gs:nr_dentry      # nr_dentry

versus this horrible thing:

        movl    percpu_counter_batch(%rip), %edx        # percpu_counter_batch,
        movl    $1, %esi        #,
        movq    $nr_dentry, %rdi        #,
        call    __percpu_counter_add    # (plus I clobber registers)

__percpu_counter_add:
        pushq   %rbp    #
        movq    %rsp, %rbp      #,
        subq    $32, %rsp       #,
        movq    %rbx, -24(%rbp) #,
        movq    %r12, -16(%rbp) #,
        movq    %r13, -8(%rbp)  #,
        movq    %rdi, %rbx      # fbc, fbc
#APP
# 216 "/home/npiggin/usr/src/linux-2.6/arch/x86/include/asm/thread_info.h" 1
        movq %gs:kernel_stack,%rax      #, pfo_ret__
# 0 "" 2
#NO_APP
        incl    -8124(%rax)     # <variable>.preempt_count
        movq    32(%rdi), %r12  # <variable>.counters, tcp_ptr__
#APP
# 78 "lib/percpu_counter.c" 1
        add %gs:this_cpu_off, %r12      # this_cpu_off, tcp_ptr__
# 0 "" 2
#NO_APP
        movslq  (%r12),%r13     #* tcp_ptr__, tmp73
        movslq  %edx,%rax       # batch, batch
        addq    %rsi, %r13      # amount, count
        cmpq    %rax, %r13      # batch, count
        jge     .L27    #,
        negl    %edx    # tmp76
        movslq  %edx,%rdx       # tmp76, tmp77
        cmpq    %rdx, %r13      # tmp77, count
        jg      .L28    #,
.L27:
        movq    %rbx, %rdi      # fbc,
        call    _raw_spin_lock  #
        addq    %r13, 8(%rbx)   # count, <variable>.count
        movq    %rbx, %rdi      # fbc,
        movl    $0, (%r12)      #,* tcp_ptr__
        call    _raw_spin_unlock        #
.L29:
#APP
# 216 "/home/npiggin/usr/src/linux-2.6/arch/x86/include/asm/thread_info.h" 1
        movq %gs:kernel_stack,%rax      #, pfo_ret__
# 0 "" 2
#NO_APP
        decl    -8124(%rax)     # <variable>.preempt_count
        movq    -8136(%rax), %rax       #, D.14625
        testb   $8, %al #, D.14625
        jne     .L32    #,
.L31:
        movq    -24(%rbp), %rbx #,
        movq    -16(%rbp), %r12 #,
        movq    -8(%rbp), %r13  #,
        leave
        ret
        .p2align 4,,10
        .p2align 3
.L28:
        movl    %r13d, (%r12)   # count,*
        jmp     .L29    #
.L32:
        call    preempt_schedule        #
        .p2align 4,,6
        jmp     .L31    #
        .size   __percpu_counter_add, .-__percpu_counter_add
        .p2align 4,,15
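
For reference, a minimal sketch (not part of the patch; the names are
illustrative) of the C that generates each of the two sequences above,
i.e. a plain DEFINE_PER_CPU variable bumped with this_cpu_inc() versus
the percpu_counter lib:

	#include <linux/percpu.h>
	#include <linux/percpu_counter.h>

	/* fast path: compiles to a single %gs-relative increment */
	static DEFINE_PER_CPU(unsigned int, nr_foo);

	/* lib path: out-of-line call, batch check, occasional spinlock */
	static struct percpu_counter nr_foo_lib;

	static void foo_count_one(void)
	{
		this_cpu_inc(nr_foo);			/* incl %gs:nr_foo */
		percpu_counter_inc(&nr_foo_lib);	/* call __percpu_counter_add */
	}

(The percpu_counter variant also needs a percpu_counter_init() call at
boot, which is exactly what gets removed from dcache_init() and
inode_init() below.)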

So, fix it.

Signed-off-by: Nick Piggin <npiggin@xxxxxxxxx>

---
I was really trying to be very careful with single-thread performance,
counting cycles for every change made :( No matter what happens,
fine-grained locking will bloat things up and slow them down, so every
effort has to be made to ensure it is done with as little impact as
possible.

Also, making the LRU counters per-cpu is not really that useful, because
the LRU itself never becomes a per-cpu structure. In my patches they
become per-zone counters, which is reasonable since all the other LRU
manipulations are per-zone as well. I'll have to change that back at
some point, too.
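
One note on the get_nr_dentry()/get_nr_inodes() summation helpers in
the patch below: the per-cpu counts are unsigned and may individually
wrap when an object is allocated on one CPU and freed on another, but
the sum over all CPUs stays exact modulo 2^32, so summing into a signed
int and clamping negative results (which can appear transiently under
concurrent updates) gives a sane value. A standalone sketch of the
arithmetic, assuming a 32-bit unsigned int:

	#include <stdio.h>

	int main(void)
	{
		unsigned int cpu0 = 0, cpu1 = 0;
		int sum;

		cpu0++;	/* object allocated on CPU 0 */
		cpu1--;	/* same object freed on CPU 1: wraps to UINT_MAX */

		sum = (int)(cpu0 + cpu1); /* modular sum gives the true count, 0 */
		printf("%d\n", sum < 0 ? 0 : sum);
		return 0;
	}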

---
 fs/dcache.c |   43 +++++++++++++++++++++++++++++--------------
 fs/inode.c  |   46 ++++++++++++++++++++++++++--------------------
 2 files changed, 55 insertions(+), 34 deletions(-)

Index: linux-2.6/fs/dcache.c
===================================================================
--- linux-2.6.orig/fs/dcache.c	2010-11-29 21:10:35.000000000 +1100
+++ linux-2.6/fs/dcache.c	2010-11-29 21:15:56.000000000 +1100
@@ -67,15 +67,33 @@ struct dentry_stat_t dentry_stat = {
 	.age_limit = 45,
 };
 
-static struct percpu_counter nr_dentry __cacheline_aligned_in_smp;
-static struct percpu_counter nr_dentry_unused __cacheline_aligned_in_smp;
+static DEFINE_PER_CPU(unsigned int, nr_dentry);
+static DEFINE_PER_CPU(unsigned int, nr_dentry_unused);
+
+static int get_nr_dentry(void)
+{
+	int i;
+	int sum = 0;
+	for_each_possible_cpu(i)
+		sum += per_cpu(nr_dentry, i);
+	return sum < 0 ? 0 : sum;
+}
+
+static int get_nr_dentry_unused(void)
+{
+	int i;
+	int sum = 0;
+	for_each_possible_cpu(i)
+		sum += per_cpu(nr_dentry_unused, i);
+	return sum < 0 ? 0 : sum;
+}
 
 #if defined(CONFIG_SYSCTL) && defined(CONFIG_PROC_FS)
 int proc_nr_dentry(ctl_table *table, int write, void __user *buffer,
 		   size_t *lenp, loff_t *ppos)
 {
-	dentry_stat.nr_dentry = percpu_counter_sum_positive(&nr_dentry);
-	dentry_stat.nr_unused = percpu_counter_sum_positive(&nr_dentry_unused);
+	dentry_stat.nr_dentry = get_nr_dentry();
+	dentry_stat.nr_unused = get_nr_dentry_unused();
 	return proc_dointvec(table, write, buffer, lenp, ppos);
 }
 #endif
@@ -95,7 +113,7 @@ static void __d_free(struct rcu_head *he
  */
 static void d_free(struct dentry *dentry)
 {
-	percpu_counter_dec(&nr_dentry);
+	this_cpu_dec(nr_dentry);
 	if (dentry->d_op && dentry->d_op->d_release)
 		dentry->d_op->d_release(dentry);
 
@@ -140,7 +158,7 @@ static void dentry_lru_add(struct dentry
 	if (list_empty(&dentry->d_lru)) {
 		list_add(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
 		dentry->d_sb->s_nr_dentry_unused++;
-		percpu_counter_inc(&nr_dentry_unused);
+		this_cpu_inc(nr_dentry_unused);
 	}
 }
 
@@ -149,7 +167,7 @@ static void dentry_lru_del(struct dentry
 	if (!list_empty(&dentry->d_lru)) {
 		list_del_init(&dentry->d_lru);
 		dentry->d_sb->s_nr_dentry_unused--;
-		percpu_counter_dec(&nr_dentry_unused);
+		this_cpu_dec(nr_dentry_unused);
 	}
 }
 
@@ -158,7 +176,7 @@ static void dentry_lru_move_tail(struct
 	if (list_empty(&dentry->d_lru)) {
 		list_add_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
 		dentry->d_sb->s_nr_dentry_unused++;
-		percpu_counter_inc(&nr_dentry_unused);
+		this_cpu_inc(nr_dentry_unused);
 	} else {
 		list_move_tail(&dentry->d_lru, &dentry->d_sb->s_dentry_lru);
 	}
@@ -546,7 +564,7 @@ static void prune_dcache(int count)
 {
 	struct super_block *sb, *p = NULL;
 	int w_count;
-	int unused = percpu_counter_sum_positive(&nr_dentry_unused);
+	int unused = get_nr_dentry_unused();
 	int prune_ratio;
 	int pruned;
 
@@ -916,7 +934,7 @@ static int shrink_dcache_memory(struct s
 		prune_dcache(nr);
 	}
 
-	nr_unused = percpu_counter_sum_positive(&nr_dentry_unused);
+	nr_unused = get_nr_dentry_unused();
 	return (nr_unused / 100) * sysctl_vfs_cache_pressure;
 }
 
@@ -986,7 +1004,7 @@ struct dentry *d_alloc(struct dentry * p
 		list_add(&dentry->d_u.d_child, &parent->d_subdirs);
 	spin_unlock(&dcache_lock);
 
-	percpu_counter_inc(&nr_dentry);
+	this_cpu_inc(nr_dentry);
 
 	return dentry;
 }
@@ -2427,9 +2445,6 @@ static void __init dcache_init(void)
 {
 	int loop;
 
-	percpu_counter_init(&nr_dentry, 0);
-	percpu_counter_init(&nr_dentry_unused, 0);
-
 	/* 
 	 * A constructor could be added for stable state like the lists,
 	 * but it is probably not worth it because of the cache nature
Index: linux-2.6/fs/inode.c
===================================================================
--- linux-2.6.orig/fs/inode.c	2010-11-29 21:20:46.000000000 +1100
+++ linux-2.6/fs/inode.c	2010-11-29 21:31:30.000000000 +1100
@@ -102,26 +102,34 @@ static DECLARE_RWSEM(iprune_sem);
  */
 struct inodes_stat_t inodes_stat;
 
-static struct percpu_counter nr_inodes __cacheline_aligned_in_smp;
-static struct percpu_counter nr_inodes_unused __cacheline_aligned_in_smp;
+static DEFINE_PER_CPU(unsigned int, nr_inodes);
+static DEFINE_PER_CPU(unsigned int, nr_inodes_unused);
 
 static struct kmem_cache *inode_cachep __read_mostly;
 
-static inline int get_nr_inodes(void)
+static int get_nr_inodes(void)
 {
-	return percpu_counter_sum_positive(&nr_inodes);
-}
-
-static inline int get_nr_inodes_unused(void)
-{
-	return percpu_counter_sum_positive(&nr_inodes_unused);
+	int i;
+	int sum = 0;
+	for_each_possible_cpu(i)
+		sum += per_cpu(nr_inodes, i);
+	return sum < 0 ? 0 : sum;
+}
+
+static int get_nr_inodes_unused(void)
+{
+	int i;
+	int sum = 0;
+	for_each_possible_cpu(i)
+		sum += per_cpu(nr_inodes_unused, i);
+	return sum < 0 ? 0 : sum;
 }
 
 int get_nr_dirty_inodes(void)
 {
+	/* not actually dirty inodes, but a wild approximation */
 	int nr_dirty = get_nr_inodes() - get_nr_inodes_unused();
 	return nr_dirty > 0 ? nr_dirty : 0;
-
 }
 
 /*
@@ -224,7 +232,7 @@ int inode_init_always(struct super_block
 	inode->i_fsnotify_mask = 0;
 #endif
 
-	percpu_counter_inc(&nr_inodes);
+	this_cpu_inc(nr_inodes);
 
 	return 0;
 out:
@@ -266,7 +274,7 @@ void __destroy_inode(struct inode *inode
 	if (inode->i_default_acl && inode->i_default_acl != ACL_NOT_CACHED)
 		posix_acl_release(inode->i_default_acl);
 #endif
-	percpu_counter_dec(&nr_inodes);
+	this_cpu_dec(nr_inodes);
 }
 EXPORT_SYMBOL(__destroy_inode);
 
@@ -335,7 +343,7 @@ static void inode_lru_list_add(struct in
 {
 	if (list_empty(&inode->i_lru)) {
 		list_add(&inode->i_lru, &inode_lru);
-		percpu_counter_inc(&nr_inodes_unused);
+		this_cpu_inc(nr_inodes_unused);
 	}
 }
 
@@ -343,7 +351,7 @@ static void inode_lru_list_del(struct in
 {
 	if (!list_empty(&inode->i_lru)) {
 		list_del_init(&inode->i_lru);
-		percpu_counter_dec(&nr_inodes_unused);
+		this_cpu_dec(nr_inodes_unused);
 	}
 }
 
@@ -513,7 +521,7 @@ void evict_inodes(struct super_block *sb
 		list_move(&inode->i_lru, &dispose);
 		list_del_init(&inode->i_wb_list);
 		if (!(inode->i_state & (I_DIRTY | I_SYNC)))
-			percpu_counter_dec(&nr_inodes_unused);
+			this_cpu_dec(nr_inodes_unused);
 	}
 	spin_unlock(&inode_lock);
 
@@ -554,7 +562,7 @@ int invalidate_inodes(struct super_block
 		list_move(&inode->i_lru, &dispose);
 		list_del_init(&inode->i_wb_list);
 		if (!(inode->i_state & (I_DIRTY | I_SYNC)))
-			percpu_counter_dec(&nr_inodes_unused);
+			this_cpu_dec(nr_inodes_unused);
 	}
 	spin_unlock(&inode_lock);
 
@@ -616,7 +624,7 @@ static void prune_icache(int nr_to_scan)
 		if (atomic_read(&inode->i_count) ||
 		    (inode->i_state & ~I_REFERENCED)) {
 			list_del_init(&inode->i_lru);
-			percpu_counter_dec(&nr_inodes_unused);
+			this_cpu_dec(nr_inodes_unused);
 			continue;
 		}
 
@@ -650,7 +658,7 @@ static void prune_icache(int nr_to_scan)
 		 */
 		list_move(&inode->i_lru, &freeable);
 		list_del_init(&inode->i_wb_list);
-		percpu_counter_dec(&nr_inodes_unused);
+		this_cpu_dec(nr_inodes_unused);
 	}
 	if (current_is_kswapd())
 		__count_vm_events(KSWAPD_INODESTEAL, reap);
@@ -1648,8 +1656,6 @@ void __init inode_init(void)
 					 SLAB_MEM_SPREAD),
 					 init_once);
 	register_shrinker(&icache_shrinker);
-	percpu_counter_init(&nr_inodes, 0);
-	percpu_counter_init(&nr_inodes_unused, 0);
 
 	/* Hash may have been set up in inode_init_early */
 	if (!hashdist)
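
With this patch the summed values are still exported through the
existing proc interface, so nothing changes for userspace; the first
two fields of /proc/sys/fs/dentry-state remain nr_dentry and nr_unused.
A minimal sketch for reading them back:

	#include <stdio.h>

	int main(void)
	{
		long nr_dentry, nr_unused;
		FILE *f = fopen("/proc/sys/fs/dentry-state", "r");

		if (!f)
			return 1;
		if (fscanf(f, "%ld %ld", &nr_dentry, &nr_unused) == 2)
			printf("dentries: %ld, unused: %ld\n",
			       nr_dentry, nr_unused);
		fclose(f);
		return 0;
	}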

--- End Message ---
