Re: [PATCH 1/3] ceph: map snapid to anonymous bdev ID

Jeff Layton <jlayton@xxxxxxxxxx> · Fri, 15 Dec 2017 08:48:36 -0500

On Fri, 2017-12-15 at 21:45 +0800, Yan, Zheng wrote:
> > On 15 Dec 2017, at 21:22, Jeff Layton <jlayton@xxxxxxxxxx> wrote:
> > 
> > On Fri, 2017-12-15 at 17:24 +0800, Yan, Zheng wrote:
> > > ceph_getattr() return zero dev ID for head inodes and set dev ID to
> > > snapid directly for snaphost inodes. This is not good because userspace
> > > utilities may consider device ID of 0 as invalid, snapid may conflict
> > > with other device's ID.
> > > 
> > > This patch introduces "snapids to anonymous bdev IDs" map. we create a
> > > new mapping when we see a snapid for the first time. we trim unused
> > > mapping after it is ilde for an hour.
> > > 
> > > Signed-off-by: "Yan, Zheng" <zyan@xxxxxxxxxx>
> > > ---
> > > fs/ceph/addr.c       |   4 +-
> > > fs/ceph/inode.c      |  31 +++++++----
> > > fs/ceph/mds_client.c |   8 +++
> > > fs/ceph/mds_client.h |  13 +++++
> > > fs/ceph/snap.c       | 148 +++++++++++++++++++++++++++++++++++++++++++++++++++
> > > fs/ceph/super.h      |  13 ++++-
> > > 6 files changed, 203 insertions(+), 14 deletions(-)
> > > 
> > > diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
> > > index dbf07051aacd..6f7f00a35190 100644
> > > --- a/fs/ceph/addr.c
> > > +++ b/fs/ceph/addr.c
> > > @@ -450,8 +450,8 @@ static int ceph_readpages(struct file *file, struct address_space *mapping,
> > > 		goto out;
> > > 
> > > 	max = fsc->mount_options->rsize >> PAGE_SHIFT;
> > > -	dout("readpages %p file %p nr_pages %d max %d\n",
> > > -	     inode, file, nr_pages, max);
> > > +	dout("readpages %p file %p ctx %p nr_pages %d max %d\n",
> > > +	     inode, file, rw_ctx, nr_pages, max);
> > > 	while (!list_empty(page_list)) {
> > > 		rc = start_read(inode, page_list, max);
> > > 		if (rc < 0)
> > > diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c
> > > index c6ec5aa46100..76a428749981 100644
> > > --- a/fs/ceph/inode.c
> > > +++ b/fs/ceph/inode.c
> > > @@ -542,14 +542,19 @@ void ceph_destroy_inode(struct inode *inode)
> > > 	 */
> > > 	if (ci->i_snap_realm) {
> > > 		struct ceph_mds_client *mdsc =
> > > -			ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc;
> > > -		struct ceph_snap_realm *realm = ci->i_snap_realm;
> > > -
> > > -		dout(" dropping residual ref to snap realm %p\n", realm);
> > > -		spin_lock(&realm->inodes_with_caps_lock);
> > > -		list_del_init(&ci->i_snap_realm_item);
> > > -		spin_unlock(&realm->inodes_with_caps_lock);
> > > -		ceph_put_snap_realm(mdsc, realm);
> > > +					ceph_inode_to_client(inode)->mdsc;
> > > +		if (ceph_snap(inode) == CEPH_NOSNAP) {
> > > +			struct ceph_snap_realm *realm = ci->i_snap_realm;
> > > +			dout(" dropping residual ref to snap realm %p\n",
> > > +			     realm);
> > > +			spin_lock(&realm->inodes_with_caps_lock);
> > > +			list_del_init(&ci->i_snap_realm_item);
> > > +			spin_unlock(&realm->inodes_with_caps_lock);
> > > +			ceph_put_snap_realm(mdsc, realm);
> > > +		} else {
> > > +			ceph_put_snapid_map(mdsc, ci->i_snapid_map);
> > > +		}
> > > +		ci->i_snap_realm = NULL;
> > > 	}
> > > 
> > > 	kfree(ci->i_symlink);
> > > @@ -763,6 +768,9 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
> > > 		pool_ns = ceph_find_or_create_string(iinfo->pool_ns_data,
> > > 						     iinfo->pool_ns_len);
> > > 
> > > +	if (ceph_snap(inode) != CEPH_NOSNAP && !ci->i_snapid_map)
> > > +		ci->i_snapid_map = ceph_get_snapid_map(mdsc, ceph_snap(inode));
> > > +
> > > 	spin_lock(&ci->i_ceph_lock);
> > > 
> > > 	/*
> > > @@ -2241,10 +2249,11 @@ int ceph_getattr(const struct path *path, struct kstat *stat,
> > > 	if (!err) {
> > > 		generic_fillattr(inode, stat);
> > > 		stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
> > > -		if (ceph_snap(inode) != CEPH_NOSNAP)
> > > -			stat->dev = ceph_snap(inode);
> > > +		if (ceph_snap(inode) == CEPH_NOSNAP)
> > > +			stat->dev = inode->i_sb->s_dev;
> > > 		else
> > > -			stat->dev = 0;
> > > +			stat->dev = ci->i_snapid_map ? ci->i_snapid_map->dev : 0;
> > > +
> > > 		if (S_ISDIR(inode->i_mode)) {
> > > 			if (ceph_test_mount_opt(ceph_sb_to_client(inode->i_sb),
> > > 						RBYTES))
> > > diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
> > > index 251dc44d84a0..b6e62f5d97c1 100644
> > > --- a/fs/ceph/mds_client.c
> > > +++ b/fs/ceph/mds_client.c
> > > @@ -3504,6 +3504,8 @@ static void delayed_work(struct work_struct *work)
> > > 	dout("mdsc delayed_work\n");
> > > 	ceph_check_delayed_caps(mdsc);
> > > 
> > > +	ceph_trim_snapid_map(mdsc);
> > > +
> > > 	mutex_lock(&mdsc->mutex);
> > > 	renew_interval = mdsc->mdsmap->m_session_timeout >> 2;
> > > 	renew_caps = time_after_eq(jiffies, HZ*renew_interval +
> > > @@ -3604,6 +3606,10 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
> > > 	ceph_caps_init(mdsc);
> > > 	ceph_adjust_min_caps(mdsc, fsc->min_caps);
> > > 
> > > +	spin_lock_init(&mdsc->snapid_map_lock);
> > > +	mdsc->snapid_map_tree = RB_ROOT;
> > > +	INIT_LIST_HEAD(&mdsc->snapid_map_lru);
> > > +
> > > 	init_rwsem(&mdsc->pool_perm_rwsem);
> > > 	mdsc->pool_perm_tree = RB_ROOT;
> > > 
> > > @@ -3797,6 +3803,8 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
> > > 	WARN_ON(!list_empty(&mdsc->cap_delay_list));
> > > 	mutex_unlock(&mdsc->mutex);
> > > 
> > > +	ceph_cleanup_snapid_map(mdsc);
> > > +
> > > 	ceph_cleanup_empty_realms(mdsc);
> > > 
> > > 	cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
> > > diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
> > > index 837ac4b087a0..87506a8fc1c3 100644
> > > --- a/fs/ceph/mds_client.h
> > > +++ b/fs/ceph/mds_client.h
> > > @@ -294,6 +294,15 @@ struct ceph_pool_perm {
> > > 	char pool_ns[];
> > > };
> > > 
> > > +struct ceph_snapid_map {
> > > +	struct rb_node node;
> > > +	struct list_head lru;
> > > +	atomic_t ref;
> > > +	u64 snap;
> > > +	dev_t dev;
> > > +	unsigned long last_used;
> > > +};
> > > +
> > > /*
> > >  * mds client state
> > >  */
> > > @@ -368,6 +377,10 @@ struct ceph_mds_client {
> > > 	struct list_head  dentry_lru;
> > > 	int		  num_dentry;
> > > 
> > > +	spinlock_t		snapid_map_lock;
> > > +	struct rb_root		snapid_map_tree;
> > > +	struct list_head	snapid_map_lru;
> > > +
> > > 	struct rw_semaphore     pool_perm_rwsem;
> > > 	struct rb_root		pool_perm_tree;
> > > 
> > > diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
> > > index 8a2ca41e4b97..0af9ea832f03 100644
> > > --- a/fs/ceph/snap.c
> > > +++ b/fs/ceph/snap.c
> > > @@ -979,3 +979,151 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
> > > 		up_write(&mdsc->snap_rwsem);
> > > 	return;
> > > }
> > > +
> > > +struct ceph_snapid_map* ceph_get_snapid_map(struct ceph_mds_client *mdsc,
> > > +					    u64 snap)
> > > +{
> > > +	struct ceph_snapid_map *sm, *exist;
> > > +	struct rb_node **p, *parent;
> > > +	int ret;
> > > +
> > > +	exist = NULL;
> > > +	spin_lock(&mdsc->snapid_map_lock);
> > > +	p = &mdsc->snapid_map_tree.rb_node;
> > > +	while (*p) {
> > > +		exist = rb_entry(*p, struct ceph_snapid_map, node);
> > > +		if (snap > exist->snap) {
> > > +			p = &(*p)->rb_left;
> > > +		} else if (snap < exist->snap) {
> > > +			p = &(*p)->rb_right;
> > > +		} else {
> > > +			atomic_inc(&exist->ref);
> > > +			list_del_init(&exist->lru);
> > > +			break;
> > > +		}
> > > +		exist = NULL;
> > > +	}
> > > +	spin_unlock(&mdsc->snapid_map_lock);
> > > +	if (exist) {
> > > +		dout("found snapid map %llx -> %x\n", exist->snap, exist->dev);
> > > +		return exist;
> > > +	}
> > > +
> > > +	sm = kmalloc(sizeof(*sm), GFP_NOFS);
> > > +	if (!sm)
> > > +		return NULL;
> > > +
> > > +	ret = get_anon_bdev(&sm->dev);
> > > +	if (ret < 0) {
> > > +		kfree(sm);
> > > +		return NULL;
> > > +	}	
> > > +
> > > +	INIT_LIST_HEAD(&sm->lru);
> > > +	atomic_set(&sm->ref, 1);
> > > +	sm->snap = snap;
> > > +
> > > +	exist = NULL;
> > > +	parent = NULL;
> > > +	p = &mdsc->snapid_map_tree.rb_node;
> > > +	spin_lock(&mdsc->snapid_map_lock);
> > > +	while (*p) {
> > > +		parent = *p;
> > > +		exist = rb_entry(*p, struct ceph_snapid_map, node);
> > > +		if (snap > exist->snap)
> > > +			p = &(*p)->rb_left;
> > > +		else if (snap < exist->snap)
> > > +			p = &(*p)->rb_right;
> > > +		else
> > > +			break;
> > > +		exist = NULL;
> > > +	}
> > > +	if (exist) {
> > > +		atomic_inc(&exist->ref);
> > > +		list_del_init(&exist->lru);
> > > +	} else {
> > > +		rb_link_node(&sm->node, parent, p);
> > > +		rb_insert_color(&sm->node, &mdsc->snapid_map_tree);
> > > +	}
> > > +	spin_unlock(&mdsc->snapid_map_lock);
> > > +	if (exist) {
> > > +		free_anon_bdev(sm->dev);
> > > +		kfree(sm);
> > > +		dout("found snapid map %llx -> %x\n", exist->snap, exist->dev);
> > > +		return exist;
> > > +	}
> > > +		
> > > +	dout("create snapid map %llx -> %x\n", sm->snap, sm->dev);
> > > +	return sm;
> > > +}
> > > +
> > > +void ceph_put_snapid_map(struct ceph_mds_client* mdsc,
> > > +			 struct ceph_snapid_map *sm)
> > > +{
> > > +	if (!sm)
> > > +		return;
> > > +	if (atomic_dec_and_lock(&sm->ref, &mdsc->snapid_map_lock)) {
> > > +		sm->last_used = jiffies;
> > > +		list_add_tail(&sm->lru, &mdsc->snapid_map_lru);
> > > +		spin_unlock(&mdsc->snapid_map_lock);
> > > +	}
> > > +}
> > > +
> > > +void ceph_trim_snapid_map(struct ceph_mds_client *mdsc)
> > > +{
> > > +	struct ceph_snapid_map *sm;
> > > +	unsigned long now;
> > > +	LIST_HEAD(to_free);
> > > +
> > > +	/* unused map expires after an hour */
> > > +	static unsigned timeout = 60 * 60 * HZ;
> > > +
> > > +	spin_lock(&mdsc->snapid_map_lock);
> > > +	now = jiffies;
> > > +
> > > +	while (!list_empty(&mdsc->snapid_map_lru)) {
> > > +		sm = list_first_entry(&mdsc->snapid_map_lru,
> > > +				      struct ceph_snapid_map, lru);
> > > +		if (time_after(sm->last_used + timeout, now))
> > > +			break;
> > > +
> > > +		rb_erase(&sm->node, &mdsc->snapid_map_tree);
> > > +		list_move(&sm->lru, &to_free);
> > > +	}
> > > +	spin_unlock(&mdsc->snapid_map_lock);
> > > +
> > > +	while (!list_empty(&to_free)) {
> > > +		sm = list_first_entry(&to_free, struct ceph_snapid_map, lru);
> > > +		list_del(&sm->lru);
> > > +		dout("trim snapid map %llx -> %x\n", sm->snap, sm->dev);
> > > +		free_anon_bdev(sm->dev);
> > > +		kfree(sm);
> > > +	}
> > > +}
> > > +
> > > +void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc)
> > > +{
> > > +	struct ceph_snapid_map *sm;
> > > +	struct rb_node *p;
> > > +	LIST_HEAD(to_free);
> > > +
> > > +	spin_lock(&mdsc->snapid_map_lock);
> > > +	while ((p = rb_first(&mdsc->snapid_map_tree))) {
> > > +		sm = rb_entry(p, struct ceph_snapid_map, node);
> > > +		rb_erase(p, &mdsc->snapid_map_tree);
> > > +		list_move(&sm->lru, &to_free);
> > > +	}
> > > +	spin_unlock(&mdsc->snapid_map_lock);
> > > +
> > > +	while (!list_empty(&to_free)) {
> > > +		sm = list_first_entry(&to_free, struct ceph_snapid_map, lru);
> > > +		list_del(&sm->lru);
> > > +		free_anon_bdev(sm->dev);
> > > +		if (atomic_read(&sm->ref)) {
> > > +			pr_err("snapid map %llx -> %x still in use\n",
> > > +			       sm->snap, sm->dev);
> > > +			continue;
> > > +		}
> > 
> > This looks problematic. If it's still in use, then you'll call list_del
> > and free_anon_bdev on it, but not kfree it. What will eventually clean
> > it up, and how will it know not to do the list_del and free_anon_bdev a
> > second time?
> 
> this is for unexpected condition. don’t have a elegant way to handle. how about:
>         
> while (list_empty(&to_free)) {
>                 sm = list_first_entry(&to_free, struct ceph_snapid_map, lru);
>                 list_del(&sm->lru);
>                 free_anon_bdev(sm->dev);
>                 if (WARN_ON_ONCE(atomic_read(&sm->ref))) {
>                         pr_err("snapid map %llx -> %x still in use\n",
>                                sm->snap, sm->dev);
>                 }
>                 kfree(sm);
> }
> 

That might work. Another thing you could do is ensure you
list_del_init(&sm->lru), and then when you go to put the last reference
to the thing check whether that list_head is empty before you do the
cleanup.

IOW, use the list_head being empty as a mark that the resources held by
this object are already cleaned up and you needn't do anything but kfree
it.

> > 
> > > +		kfree(sm);
> > > +	}
> > > +}
> > > diff --git a/fs/ceph/super.h b/fs/ceph/super.h
> > > index 75701c199b2b..453edca41a37 100644
> > > --- a/fs/ceph/super.h
> > > +++ b/fs/ceph/super.h
> > > @@ -362,7 +362,10 @@ struct ceph_inode_info {
> > > 	struct list_head i_unsafe_iops;   /* uncommitted mds inode ops */
> > > 	spinlock_t i_unsafe_lock;
> > > 
> > > -	struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
> > > +	union {
> > > +		struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
> > > +		struct ceph_snapid_map *i_snapid_map; /* snapid -> dev_t */
> > > +	};
> > > 	int i_snap_realm_counter; /* snap realm (if caps) */
> > > 	struct list_head i_snap_realm_item;
> > > 	struct list_head i_snap_flush_item;
> > > @@ -778,6 +781,14 @@ extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
> > > 				  struct ceph_cap_snap *capsnap);
> > > extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
> > > 
> > > +extern struct ceph_snapid_map *ceph_get_snapid_map(struct ceph_mds_client *mdsc,
> > > +						   u64 snap);
> > > +extern void ceph_put_snapid_map(struct ceph_mds_client* mdsc,
> > > +				struct ceph_snapid_map *sm);
> > > +extern void ceph_trim_snapid_map(struct ceph_mds_client *mdsc);
> > > +extern void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc);
> > > +
> > > +
> > > /*
> > >  * a cap_snap is "pending" if it is still awaiting an in-progress
> > >  * sync write (that may/may not still update size, mtime, etc.).
> > 
> > -- 
> > Jeff Layton <jlayton@xxxxxxxxxx>
> 
> 

-- 
Jeff Layton <jlayton@xxxxxxxxxx>
--
To unsubscribe from this list: send the line "unsubscribe ceph-devel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html