On Thu, 2003-11-06 at 15:01, Martin Hicks wrote:
>
> I think it's possible to use alloc_percpu(). The best way would be to
> overhaul the percpu_counter structure to use alloc_percpu(). With just
> a quick inspection it looks like this would solve the kmalloc size
> problems for a long time, although it would affect performance because
> you do one kmalloc() per processor when you do alloc_percpu().
>
> Let me take a look at that...

Here's the patch. I tested it on linux-2.6.0-test9 on ia64. It seems to work. (Please test with caution, of course.) I didn't do any benchmarking, because I don't have a reasonable filesystem benchmark setup.

Any opinions?

Two patches are attached: one changes percpu_counters to use alloc_percpu(), and the other changes ext3 so that it holds pointers to percpu_counters. The latter is probably not even required anymore, since there is no longer an array of size NR_CPUS in struct percpu_counter.

mh

--
Martin Hicks
Wild Open Source Inc.
mort@xxxxxxxxxxxxxxxxxx
613-266-2296
# This is a BitKeeper generated patch for the following project: # Project Name: Linux kernel tree # This patch format is intended for GNU patch command version 2.5 or higher. # This patch includes the following deltas: # ChangeSet 1.1356 -> 1.1357 # fs/ext3/balloc.c 1.18 -> 1.19 # fs/ext3/ialloc.c 1.31 -> 1.32 # fs/ext3/super.c 1.79 -> 1.80 # include/linux/ext3_fs_sb.h 1.7 -> 1.8 # # The following is the BitKeeper ChangeSet Log # -------------------------------------------- # 03/11/06 mort@xxxxxxxxxxxxxxxx 1.1357 # Change the percpu_counters in the ext3_sb_info struct to be pointers. # This gets ext3 linking with a large NR_CPUS. Tested with 512. # -------------------------------------------- # diff -Nru a/fs/ext3/balloc.c b/fs/ext3/balloc.c --- a/fs/ext3/balloc.c Fri Nov 7 13:30:23 2003 +++ b/fs/ext3/balloc.c Fri Nov 7 13:30:23 2003 @@ -254,7 +254,7 @@ cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) + dquot_freed_blocks); spin_unlock(sb_bgl_lock(sbi, block_group)); - percpu_counter_mod(&sbi->s_freeblocks_counter, count); + percpu_counter_mod(sbi->s_freeblocks_counter, count); /* We dirtied the bitmap block */ BUFFER_TRACE(bitmap_bh, "dirtied bitmap block"); @@ -511,7 +511,7 @@ es = EXT3_SB(sb)->s_es; ext3_debug("goal=%lu.\n", goal); - free_blocks = percpu_counter_read_positive(&sbi->s_freeblocks_counter); + free_blocks = percpu_counter_read_positive(sbi->s_freeblocks_counter); root_blocks = le32_to_cpu(es->s_r_blocks_count); if (free_blocks < root_blocks + 1 && !capable(CAP_SYS_RESOURCE) && sbi->s_resuid != current->fsuid && @@ -652,7 +652,7 @@ gdp->bg_free_blocks_count = cpu_to_le16(le16_to_cpu(gdp->bg_free_blocks_count) - 1); spin_unlock(sb_bgl_lock(sbi, group_no)); - percpu_counter_mod(&sbi->s_freeblocks_counter, -1); + percpu_counter_mod(sbi->s_freeblocks_counter, -1); BUFFER_TRACE(gdp_bh, "journal_dirty_metadata for group descriptor"); err = ext3_journal_dirty_metadata(handle, gdp_bh); diff -Nru a/fs/ext3/ialloc.c b/fs/ext3/ialloc.c --- a/fs/ext3/ialloc.c 
Fri Nov 7 13:30:23 2003 +++ b/fs/ext3/ialloc.c Fri Nov 7 13:30:23 2003 @@ -169,9 +169,9 @@ gdp->bg_used_dirs_count = cpu_to_le16( le16_to_cpu(gdp->bg_used_dirs_count) - 1); spin_unlock(sb_bgl_lock(sbi, block_group)); - percpu_counter_inc(&sbi->s_freeinodes_counter); + percpu_counter_inc(sbi->s_freeinodes_counter); if (is_directory) - percpu_counter_dec(&sbi->s_dirs_counter); + percpu_counter_dec(sbi->s_dirs_counter); } BUFFER_TRACE(bh2, "call ext3_journal_dirty_metadata"); @@ -206,7 +206,7 @@ struct buffer_head *bh; int group, best_group = -1; - freei = percpu_counter_read_positive(&EXT3_SB(sb)->s_freeinodes_counter); + freei = percpu_counter_read_positive(EXT3_SB(sb)->s_freeinodes_counter); avefreei = freei / ngroups; for (group = 0; group < ngroups; group++) { @@ -268,11 +268,11 @@ struct ext3_group_desc *desc; struct buffer_head *bh; - freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter); + freei = percpu_counter_read_positive(sbi->s_freeinodes_counter); avefreei = freei / ngroups; - freeb = percpu_counter_read_positive(&sbi->s_freeblocks_counter); + freeb = percpu_counter_read_positive(sbi->s_freeblocks_counter); avefreeb = freeb / ngroups; - ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter); + ndirs = percpu_counter_read_positive(sbi->s_dirs_counter); if ((parent == sb->s_root->d_inode) || (parent->i_flags & EXT3_TOPDIR_FL)) { @@ -533,9 +533,9 @@ err = ext3_journal_dirty_metadata(handle, bh2); if (err) goto fail; - percpu_counter_dec(&sbi->s_freeinodes_counter); + percpu_counter_dec(sbi->s_freeinodes_counter); if (S_ISDIR(mode)) - percpu_counter_inc(&sbi->s_dirs_counter); + percpu_counter_inc(sbi->s_dirs_counter); sb->s_dirt = 1; inode->i_uid = current->fsuid; diff -Nru a/fs/ext3/super.c b/fs/ext3/super.c --- a/fs/ext3/super.c Fri Nov 7 13:30:23 2003 +++ b/fs/ext3/super.c Fri Nov 7 13:30:24 2003 @@ -421,6 +421,9 @@ ext3_blkdev_remove(sbi); } sb->s_fs_info = NULL; + kfree(sbi->s_freeblocks_counter); + kfree(sbi->s_freeinodes_counter); + 
kfree(sbi->s_dirs_counter); kfree(sbi); return; } @@ -848,6 +851,32 @@ return res; } +static int ext3_setup_sbi (struct ext3_sb_info *sbi) +{ + memset(sbi, 0, sizeof(*sbi)); + + sbi->s_freeblocks_counter = kmalloc(sizeof(struct percpu_counter), GFP_KERNEL); + if (!sbi->s_freeblocks_counter) + return -ENOMEM; + sbi->s_freeinodes_counter = kmalloc(sizeof(struct percpu_counter), GFP_KERNEL); + if (!sbi->s_freeinodes_counter) + goto out_freeblocks; + sbi->s_dirs_counter = kmalloc(sizeof(struct percpu_counter), GFP_KERNEL); + if (!sbi->s_dirs_counter) + goto out_freeinode; + + sbi->s_mount_opt = 0; + sbi->s_resuid = EXT3_DEF_RESUID; + sbi->s_resgid = EXT3_DEF_RESGID; + return 0; + + out_freeinode: + kfree(sbi->s_freeinodes_counter); + out_freeblocks: + kfree(sbi->s_freeblocks_counter); + return -ENOMEM; +} + static int ext3_check_descriptors (struct super_block * sb) { struct ext3_sb_info *sbi = EXT3_SB(sb); @@ -1048,10 +1077,10 @@ if (!sbi) return -ENOMEM; sb->s_fs_info = sbi; - memset(sbi, 0, sizeof(*sbi)); - sbi->s_mount_opt = 0; - sbi->s_resuid = EXT3_DEF_RESUID; - sbi->s_resgid = EXT3_DEF_RESGID; + if (ext3_setup_sbi(sbi)) { + kfree(sbi); + return -ENOMEM; + } blocksize = sb_min_blocksize(sb, EXT3_MIN_BLOCK_SIZE); if (!blocksize) { @@ -1266,9 +1295,9 @@ } memset(sbi->s_debts, 0, sbi->s_groups_count * sizeof(u8)); - percpu_counter_init(&sbi->s_freeblocks_counter); - percpu_counter_init(&sbi->s_freeinodes_counter); - percpu_counter_init(&sbi->s_dirs_counter); + percpu_counter_init(sbi->s_freeblocks_counter); + percpu_counter_init(sbi->s_freeinodes_counter); + percpu_counter_init(sbi->s_dirs_counter); bgl_lock_init(&sbi->s_blockgroup_lock); for (i = 0; i < db_count; i++) { @@ -1383,11 +1412,11 @@ test_opt(sb,DATA_FLAGS) == EXT3_MOUNT_ORDERED_DATA ? 
"ordered": "writeback"); - percpu_counter_mod(&sbi->s_freeblocks_counter, + percpu_counter_mod(sbi->s_freeblocks_counter, ext3_count_free_blocks(sb)); - percpu_counter_mod(&sbi->s_freeinodes_counter, + percpu_counter_mod(sbi->s_freeinodes_counter, ext3_count_free_inodes(sb)); - percpu_counter_mod(&sbi->s_dirs_counter, + percpu_counter_mod(sbi->s_dirs_counter, ext3_count_dirs(sb)); return 0; @@ -1404,6 +1433,9 @@ brelse(bh); out_fail: sb->s_fs_info = NULL; + kfree(sbi->s_freeblocks_counter); + kfree(sbi->s_freeinodes_counter); + kfree(sbi->s_dirs_counter); kfree(sbi); return -EINVAL; } diff -Nru a/include/linux/ext3_fs_sb.h b/include/linux/ext3_fs_sb.h --- a/include/linux/ext3_fs_sb.h Fri Nov 7 13:30:24 2003 +++ b/include/linux/ext3_fs_sb.h Fri Nov 7 13:30:24 2003 @@ -53,9 +53,9 @@ u32 s_hash_seed[4]; int s_def_hash_version; u8 *s_debts; - struct percpu_counter s_freeblocks_counter; - struct percpu_counter s_freeinodes_counter; - struct percpu_counter s_dirs_counter; + struct percpu_counter *s_freeblocks_counter; + struct percpu_counter *s_freeinodes_counter; + struct percpu_counter *s_dirs_counter; struct blockgroup_lock s_blockgroup_lock; /* Journaling */
# This is a BitKeeper generated patch for the following project: # Project Name: Linux kernel tree # This patch format is intended for GNU patch command version 2.5 or higher. # This patch includes the following deltas: # ChangeSet 1.1357 -> 1.1358 # lib/percpu_counter.c 1.3 -> 1.4 # include/linux/percpu_counter.h 1.2 -> 1.3 # # The following is the BitKeeper ChangeSet Log # -------------------------------------------- # 03/11/07 mort@xxxxxxxxxxxxxxxx 1.1358 # percpu_counters-update.diff # -------------------------------------------- # diff -Nru a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h --- a/include/linux/percpu_counter.h Fri Nov 7 13:30:04 2003 +++ b/include/linux/percpu_counter.h Fri Nov 7 13:30:04 2003 @@ -8,6 +8,7 @@ #include <linux/spinlock.h> #include <linux/smp.h> #include <linux/threads.h> +#include <linux/percpu.h> #ifdef CONFIG_SMP @@ -18,7 +19,7 @@ struct percpu_counter { spinlock_t lock; long count; - struct __percpu_counter counters[NR_CPUS]; + struct __percpu_counter *counters; }; #if NR_CPUS >= 16 @@ -29,12 +30,14 @@ static inline void percpu_counter_init(struct percpu_counter *fbc) { - int i; - spin_lock_init(&fbc->lock); fbc->count = 0; - for (i = 0; i < NR_CPUS; i++) - fbc->counters[i].count = 0; + fbc->counters = alloc_percpu(struct __percpu_counter); +} + +static inline void percpu_counter_destroy(struct percpu_counter *fbc) +{ + free_percpu(fbc->counters); } void percpu_counter_mod(struct percpu_counter *fbc, long amount); @@ -67,6 +70,10 @@ static inline void percpu_counter_init(struct percpu_counter *fbc) { fbc->count = 0; +} + +static inline void percpu_counter_destroy(struct percpu_counter *fbc) +{ } static inline void diff -Nru a/lib/percpu_counter.c b/lib/percpu_counter.c --- a/lib/percpu_counter.c Fri Nov 7 13:30:04 2003 +++ b/lib/percpu_counter.c Fri Nov 7 13:30:04 2003 @@ -4,9 +4,10 @@ void percpu_counter_mod(struct percpu_counter *fbc, long amount) { - int cpu = get_cpu(); - long count = 
fbc->counters[cpu].count; + long count; + count = *(long *)get_cpu_ptr(fbc->counters); + put_cpu_ptr(); count += amount; if (count >= FBC_BATCH || count <= -FBC_BATCH) { spin_lock(&fbc->lock); @@ -14,8 +15,8 @@ spin_unlock(&fbc->lock); count = 0; } - fbc->counters[cpu].count = count; - put_cpu(); + *(long *)get_cpu_ptr(fbc->counters) = count; + put_cpu_ptr(); } EXPORT_SYMBOL(percpu_counter_mod);