Re: [PATCH] vfs: get_next_ino(), never inum=0

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Apr 29, 2014, at 10:08 PM, J. R. Okajima <hooanon05g@xxxxxxxxx> wrote:
> Christoph Hellwig wrote:
>> If you care about really unique inode numbers you shouldn't use get_next_ino
>> but something like an idr allocator.  The default i_ino assigned in
>> new_inode() from which get_next_ino was factored out was mostly intended
>> for small synthetic filesystems with few enough inodes that it wouldn't
>> wrap around.
> 
> Grep-ping get_next_ino, I got 30 calls in mainline.
> How many of them are get_next_ino() inappropriate? I don't know. But at
> least for tmpfs, it is better to manage the inums by itself since tmpfs
> must be one of the biggest consumer of inums and it is NFS-exportable.
> 
> Do you think we need a common function in VFS to manage inums per sb, or
> it is totally up to filesystem and the common function is unnecessary?
> 
> Instead of idr, I was thinking about a simple bitmap in tmpfs such like
> this. It introduces a new mount option "ino" which forces tmpfs to
> assign the lowest unused number for a new inode within the mounted
> tmpfs. Without "ino" or specifying "noino", the behaviour is unchanged
> (use vfs:get_next_ino()).
> But it may not scale well due to the single spinlock every time.

The simplest solution is to just change get_next_ino() to return an
unsigned long to match i_ino, instead of an int.  That avoids any
overhead in the most common cases (i.e. 64-bit systems where I highly
doubt there will ever be a counter wrap).

We've also been discussing changing i_ino to be u64 so that this works
properly on 32-bit systems accessing 64-bit filesystems, but I don't
know where that stands today.


For 32-bit systems it would be possible to use get_next_ino() for the
common case of inode numbers < 2^32, and only fall back to doing a
lookup for an already-used inode in tmpfs if the counter wraps to 1.

That would avoid overhead for 99% of users since they are unlikely
to create more than 2^32 inodes in tmpfs over the lifetime of their
system.  Even in the check-if-inum-in-use case after the 2^32 wrap,
it is very unlikely that many inodes would still be in use so the
hash lookup should go relatively quickly.

It could use something like an optimized find_inode() that just
determined quickly if the hash entry was in use.  That would avoid
the constant spinlock contention in the most common cases, and only
impose it for systems in rare cases.

That said, I expect this overhead is more than just going to u64 for
32-bit systems.

Cheers, Andreas

> 
> commit 214d38e8c34fb341fd0f37cc92614b5e93e0803b
> Author: J. R. Okajima <hooanon05@xxxxxxxxxxx>
> Date:   Mon Sep 2 10:45:42 2013 +0900
> 
>    shmem: management for inum
> 
>    Signed-off-by: J. R. Okajima <hooanon05g@xxxxxxxxx>
> 
> diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h
> index 4d1771c..39762e1 100644
> --- a/include/linux/shmem_fs.h
> +++ b/include/linux/shmem_fs.h
> @@ -29,6 +29,8 @@ struct shmem_sb_info {
> 	unsigned long max_inodes;   /* How many inodes are allowed */
> 	unsigned long free_inodes;  /* How many are left for allocation */
> 	spinlock_t stat_lock;	    /* Serialize shmem_sb_info changes */
> +	spinlock_t ino_lock;
> +	unsigned long *ino_bitmap;
> 	kuid_t uid;		    /* Mount uid for root directory */
> 	kgid_t gid;		    /* Mount gid for root directory */
> 	umode_t mode;		    /* Mount mode for root directory */
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 9f70e02..bc2c5e4 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -197,14 +197,61 @@ static int shmem_reserve_inode(struct super_block *sb)
> 	return 0;
> }
> 
> -static void shmem_free_inode(struct super_block *sb)
> +static void shmem_free_inode(struct inode *inode)
> {
> +	struct super_block *sb = inode->i_sb;
> 	struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
> +
> 	if (sbinfo->max_inodes) {
> 		spin_lock(&sbinfo->stat_lock);
> 		sbinfo->free_inodes++;
> 		spin_unlock(&sbinfo->stat_lock);
> 	}
> +
> +	if (!inode->i_nlink) {
> +		spin_lock(&sbinfo->ino_lock);
> +		if (sbinfo->ino_bitmap)
> +			clear_bit(inode->i_ino - 2, sbinfo->ino_bitmap);
> +		spin_unlock(&sbinfo->ino_lock);
> +	}
> +}
> +
> +/*
> + * This is unsigned int instead of unsigned long.
> + * For details, see fs/inode.c:get_next_ino().
> + */
> +unsigned int shmem_next_ino(struct super_block *sb)
> +{
> +	unsigned long ino;
> +	struct shmem_sb_info *sbinfo;
> +
> +	ino = 0;
> +	sbinfo = SHMEM_SB(sb);
> +	if (sbinfo->ino_bitmap) {
> +		spin_lock(&sbinfo->ino_lock);
> +		/*
> +		 * someone else may remount,
> +		 * and ino_bitmap might be reset.
> +		 */
> +		if (sbinfo->ino_bitmap
> +		    && !bitmap_full(sbinfo->ino_bitmap, sbinfo->max_inodes)) {
> +			ino = find_first_zero_bit(sbinfo->ino_bitmap,
> +						  sbinfo->max_inodes);
> +			set_bit(ino, sbinfo->ino_bitmap);
> +			ino += 2; /* ino 0 and 1 are reserved */
> +		}
> +		spin_unlock(&sbinfo->ino_lock);
> +	}
> +
> +	/*
> +	 * someone else did remount,
> +	 * or ino_bitmap is unused originally,
> +	 * or ino_bimapt is full.
> +	 */
> +	if (!ino)
> +		ino = get_next_ino();
> +
> +	return ino;
> }
> 
> /**
> @@ -578,7 +625,7 @@ static void shmem_evict_inode(struct inode *inode)
> 
> 	simple_xattrs_free(&info->xattrs);
> 	WARN_ON(inode->i_blocks);
> -	shmem_free_inode(inode->i_sb);
> +	shmem_free_inode(inode);
> 	clear_inode(inode);
> }
> 
> @@ -1306,7 +1353,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
> 
> 	inode = new_inode(sb);
> 	if (inode) {
> -		inode->i_ino = get_next_ino();
> +		inode->i_ino = shmem_next_ino(sb);
> 		inode_init_owner(inode, dir, mode);
> 		inode->i_blocks = 0;
> 		inode->i_mapping->backing_dev_info = &shmem_backing_dev_info;
> @@ -1348,7 +1395,7 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
> 			break;
> 		}
> 	} else
> -		shmem_free_inode(sb);
> +		shmem_free_inode(inode);
> 	return inode;
> }
> 
> @@ -1945,7 +1992,7 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry)
> 	struct inode *inode = dentry->d_inode;
> 
> 	if (inode->i_nlink > 1 && !S_ISDIR(inode->i_mode))
> -		shmem_free_inode(inode->i_sb);
> +		shmem_free_inode(inode);
> 
> 	dir->i_size -= BOGO_DIRENT_SIZE;
> 	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
> @@ -2315,6 +2362,54 @@ static const struct export_operations shmem_export_ops = {
> 	.fh_to_dentry	= shmem_fh_to_dentry,
> };
> 
> +static void shmem_ino_bitmap(struct shmem_sb_info *sbinfo,
> +			     unsigned long prev_max)
> +{
> +	unsigned long *p;
> +	unsigned long n, d;
> +	int do_msg;
> +
> +	n = sbinfo->max_inodes / BITS_PER_BYTE;
> +	if (sbinfo->max_inodes % BITS_PER_BYTE)
> +		n++;
> +
> +	do_msg = 0;
> +	if (sbinfo->ino_bitmap) {
> +		/*
> +		 * by shrinking the bitmap, the large inode number in use
> +		 * may be left. but it is harmless.
> +		 */
> +		d = 0;
> +		if (sbinfo->max_inodes > prev_max) {
> +			d = sbinfo->max_inodes - prev_max;
> +			d /= BITS_PER_BYTE;
> +		}
> +		spin_lock(&sbinfo->ino_lock);
> +		p = krealloc(sbinfo->ino_bitmap, n, GFP_NOWAIT);
> +		if (p) {
> +			memset(p + n - d, 0, d);
> +			sbinfo->ino_bitmap = p;
> +			spin_unlock(&sbinfo->ino_lock);
> +		} else {
> +			p = sbinfo->ino_bitmap;
> +			sbinfo->ino_bitmap = NULL;
> +			spin_unlock(&sbinfo->ino_lock);
> +			kfree(p);
> +			do_msg = 1;
> +		}
> +	} else {
> +		p = kzalloc(n, GFP_NOFS);
> +		spin_lock(&sbinfo->ino_lock);
> +		sbinfo->ino_bitmap = p;
> +		spin_unlock(&sbinfo->ino_lock);
> +		do_msg = !p;
> +	}
> +
> +	if (unlikely(do_msg))
> +		pr_err("%s: ino failed (%lu bytes). Ignored.\n",
> +		       __func__, n);
> +}
> +
> static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
> 			       bool remount)
> {
> @@ -2322,7 +2417,10 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
> 	struct mempolicy *mpol = NULL;
> 	uid_t uid;
> 	gid_t gid;
> +	bool do_ino;
> +	unsigned long old_val = sbinfo->max_inodes;
> 
> +	do_ino = 0;
> 	while (options != NULL) {
> 		this_char = options;
> 		for (;;) {
> @@ -2342,6 +2440,14 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
> 		}
> 		if (!*this_char)
> 			continue;
> +		if (!strcmp(this_char, "ino")) {
> +			do_ino = 1;
> +			continue;
> +		} else if (!strcmp(this_char, "noino")) {
> +			do_ino = 0;
> +			continue;
> +		}
> +
> 		if ((value = strchr(this_char,'=')) != NULL) {
> 			*value++ = 0;
> 		} else {
> @@ -2370,7 +2476,7 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
> 				goto bad_val;
> 		} else if (!strcmp(this_char,"nr_inodes")) {
> 			sbinfo->max_inodes = memparse(value, &rest);
> -			if (*rest)
> +			if (*rest || !sbinfo->max_inodes)
> 				goto bad_val;
> 		} else if (!strcmp(this_char,"mode")) {
> 			if (remount)
> @@ -2408,6 +2514,16 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo,
> 		}
> 	}
> 	sbinfo->mpol = mpol;
> +
> +	if (do_ino)
> +		shmem_ino_bitmap(sbinfo, old_val);
> +	else if (sbinfo->ino_bitmap) {
> +		void *p = sbinfo->ino_bitmap;
> +		spin_lock(&sbinfo->ino_lock);
> +		sbinfo->ino_bitmap = NULL;
> +		spin_unlock(&sbinfo->ino_lock);
> +		kfree(p);
> +	}
> 	return 0;
> 
> bad_val:
> @@ -2472,6 +2588,8 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
> 			sbinfo->max_blocks << (PAGE_CACHE_SHIFT - 10));
> 	if (sbinfo->max_inodes != shmem_default_max_inodes())
> 		seq_printf(seq, ",nr_inodes=%lu", sbinfo->max_inodes);
> +	if (sbinfo->ino_bitmap)
> +		seq_printf(seq, ",ino");
> 	if (sbinfo->mode != (S_IRWXUGO | S_ISVTX))
> 		seq_printf(seq, ",mode=%03ho", sbinfo->mode);
> 	if (!uid_eq(sbinfo->uid, GLOBAL_ROOT_UID))
> @@ -2491,6 +2609,7 @@ static void shmem_put_super(struct super_block *sb)
> 
> 	percpu_counter_destroy(&sbinfo->used_blocks);
> 	mpol_put(sbinfo->mpol);
> +	kfree(sbinfo->ino_bitmap);
> 	kfree(sbinfo);
> 	sb->s_fs_info = NULL;
> }
> @@ -2510,6 +2629,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
> 	sbinfo->mode = S_IRWXUGO | S_ISVTX;
> 	sbinfo->uid = current_fsuid();
> 	sbinfo->gid = current_fsgid();
> +	spin_lock_init(&sbinfo->ino_lock);
> 	sb->s_fs_info = sbinfo;
> 
> #ifdef CONFIG_TMPFS
> --
> To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


Cheers, Andreas





Attachment: signature.asc
Description: Message signed with OpenPGP using GPGMail


[Index of Archives]     [Linux Ext4 Filesystem]     [Union Filesystem]     [Filesystem Testing]     [Ceph Users]     [Ecryptfs]     [AutoFS]     [Kernel Newbies]     [Share Photos]     [Security]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Linux Cachefs]     [Reiser Filesystem]     [Linux RAID]     [Samba]     [Device Mapper]     [CEPH Development]
  Powered by Linux