Re: [PATCH 3.12 157/175] dm space map metadata: fix refcount decrement below 0 which caused corruption

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Sat, Mar 22, 2014 at 11:44:46AM +0100, Jiri Slaby wrote:
> From: Joe Thornber <ejt@xxxxxxxxxx>
> 
> 3.12-stable review patch.  If anyone has any objections, please let me know.

This patch is tagged for stable 3.13+ so I'm not sure it's suitable for a
3.12 kernel.

Cheers,
--
Luís

> ===============
> 
> commit cebc2de44d3bce53e46476e774126c298ca2c8a9 upstream.
> 
> This has been a relatively long-standing issue that wasn't nailed down
> until Teng-Feng Yang's meticulous bug report to dm-devel on 3/7/2014,
> see: http://www.redhat.com/archives/dm-devel/2014-March/msg00021.html
> 
> From that report:
>   "When decreasing the reference count of a metadata block with its
>   reference count equals 3, we will call dm_btree_remove() to remove
>   this enrty from the B+tree which keeps the reference count info in
>   metadata device.
> 
>   The B+tree will try to rebalance the entry of the child nodes in each
>   node it traversed, and the rebalance process contains the following
>   steps.
> 
>   (1) Finding the corresponding children in current node (shadow_current(s))
>   (2) Shadow the children block (issue BOP_INC)
>   (3) redistribute keys among children, and free children if necessary (issue BOP_DEC)
> 
>   Since the update of a metadata block's reference count could be
>   recursive, we will stash these reference count update operations in
>   smm->uncommitted and then process them in a FILO fashion.
> 
>   The problem is that step(3) could free the children which is created
>   in step(2), so the BOP_DEC issued in step(3) will be carried out
>   before the BOP_INC issued in step(2) since these BOPs will be
>   processed in FILO fashion. Once the BOP_DEC from step(3) tries to
>   decrease the reference count of newly shadow block, it will report
>   failure for its reference equals 0 before decreasing. It looks like we
>   can solve this issue by processing these BOPs in a FIFO fashion
>   instead of FILO."
> 
> Commit 5b564d80 ("dm space map: disallow decrementing a reference count
> below zero") changed the code to report an error for this temporary
> refcount decrement below zero.  So what was previously a harmless
> invalid refcount became a hard failure due to the new error path:
> 
>  device-mapper: space map common: unable to decrement a reference count below 0
>  device-mapper: thin: 253:6: dm_thin_insert_block() failed: error = -22
>  device-mapper: thin: 253:6: switching pool to read-only mode
> 
> This bug is in dm persistent-data code that is common to the DM thin and
> cache targets.  So any users of those targets should apply this fix.
> 
> Fix this by applying recursive space map operations in FIFO order rather
> than FILO.
> 
> Resolves: https://bugzilla.kernel.org/show_bug.cgi?id=68801
> 
> Reported-by: Apollon Oikonomopoulos <apoikos@xxxxxxxxxx>
> Reported-by: edwillam1007@xxxxxxxxx
> Reported-by: Teng-Feng Yang <shinrairis@xxxxxxxxx>
> Signed-off-by: Joe Thornber <ejt@xxxxxxxxxx>
> Signed-off-by: Mike Snitzer <snitzer@xxxxxxxxxx>
> Signed-off-by: Jiri Slaby <jslaby@xxxxxxx>
> ---
>  drivers/md/persistent-data/dm-space-map-metadata.c | 113 +++++++++++++++++----
>  1 file changed, 92 insertions(+), 21 deletions(-)
> 
> diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c
> index afb419e514bf..579b58200bf2 100644
> --- a/drivers/md/persistent-data/dm-space-map-metadata.c
> +++ b/drivers/md/persistent-data/dm-space-map-metadata.c
> @@ -91,6 +91,69 @@ struct block_op {
>  	dm_block_t block;
>  };
>  
> +struct bop_ring_buffer {
> +	unsigned begin;
> +	unsigned end;
> +	struct block_op bops[MAX_RECURSIVE_ALLOCATIONS + 1];
> +};
> +
> +static void brb_init(struct bop_ring_buffer *brb)
> +{
> +	brb->begin = 0;
> +	brb->end = 0;
> +}
> +
> +static bool brb_empty(struct bop_ring_buffer *brb)
> +{
> +	return brb->begin == brb->end;
> +}
> +
> +static unsigned brb_next(struct bop_ring_buffer *brb, unsigned old)
> +{
> +	unsigned r = old + 1;
> +	return (r >= (sizeof(brb->bops) / sizeof(*brb->bops))) ? 0 : r;
> +}
> +
> +static int brb_push(struct bop_ring_buffer *brb,
> +		    enum block_op_type type, dm_block_t b)
> +{
> +	struct block_op *bop;
> +	unsigned next = brb_next(brb, brb->end);
> +
> +	/*
> +	 * We don't allow the last bop to be filled, this way we can
> +	 * differentiate between full and empty.
> +	 */
> +	if (next == brb->begin)
> +		return -ENOMEM;
> +
> +	bop = brb->bops + brb->end;
> +	bop->type = type;
> +	bop->block = b;
> +
> +	brb->end = next;
> +
> +	return 0;
> +}
> +
> +static int brb_pop(struct bop_ring_buffer *brb, struct block_op *result)
> +{
> +	struct block_op *bop;
> +
> +	if (brb_empty(brb))
> +		return -ENODATA;
> +
> +	bop = brb->bops + brb->begin;
> +	result->type = bop->type;
> +	result->block = bop->block;
> +
> +	brb->begin = brb_next(brb, brb->begin);
> +
> +	return 0;
> +}
> +
> +/*----------------------------------------------------------------*/
> +
>  struct sm_metadata {
>  	struct dm_space_map sm;
>  
> @@ -101,25 +164,20 @@ struct sm_metadata {
>  
>  	unsigned recursion_count;
>  	unsigned allocated_this_transaction;
> -	unsigned nr_uncommitted;
> -	struct block_op uncommitted[MAX_RECURSIVE_ALLOCATIONS];
> +	struct bop_ring_buffer uncommitted;
>  
>  	struct threshold threshold;
>  };
>  
>  static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t b)
>  {
> -	struct block_op *op;
> +	int r = brb_push(&smm->uncommitted, type, b);
>  
> -	if (smm->nr_uncommitted == MAX_RECURSIVE_ALLOCATIONS) {
> +	if (r) {
>  		DMERR("too many recursive allocations");
>  		return -ENOMEM;
>  	}
>  
> -	op = smm->uncommitted + smm->nr_uncommitted++;
> -	op->type = type;
> -	op->block = b;
> -
>  	return 0;
>  }
>  
> @@ -158,11 +216,17 @@ static int out(struct sm_metadata *smm)
>  		return -ENOMEM;
>  	}
>  
> -	if (smm->recursion_count == 1 && smm->nr_uncommitted) {
> -		while (smm->nr_uncommitted && !r) {
> -			smm->nr_uncommitted--;
> -			r = commit_bop(smm, smm->uncommitted +
> -				       smm->nr_uncommitted);
> +	if (smm->recursion_count == 1) {
> +		while (!brb_empty(&smm->uncommitted)) {
> +			struct block_op bop;
> +
> +			r = brb_pop(&smm->uncommitted, &bop);
> +			if (r) {
> +				DMERR("bug in bop ring buffer");
> +				break;
> +			}
> +
> +			r = commit_bop(smm, &bop);
>  			if (r)
>  				break;
>  		}
> @@ -217,7 +281,8 @@ static int sm_metadata_get_nr_free(struct dm_space_map *sm, dm_block_t *count)
>  static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b,
>  				 uint32_t *result)
>  {
> -	int r, i;
> +	int r;
> +	unsigned i;
>  	struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
>  	unsigned adjustment = 0;
>  
> @@ -225,8 +290,10 @@ static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b,
>  	 * We may have some uncommitted adjustments to add.  This list
>  	 * should always be really short.
>  	 */
> -	for (i = 0; i < smm->nr_uncommitted; i++) {
> -		struct block_op *op = smm->uncommitted + i;
> +	for (i = smm->uncommitted.begin;
> +	     i != smm->uncommitted.end;
> +	     i = brb_next(&smm->uncommitted, i)) {
> +		struct block_op *op = smm->uncommitted.bops + i;
>  
>  		if (op->block != b)
>  			continue;
> @@ -254,7 +321,8 @@ static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b,
>  static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm,
>  					      dm_block_t b, int *result)
>  {
> -	int r, i, adjustment = 0;
> +	int r, adjustment = 0;
> +	unsigned i;
>  	struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm);
>  	uint32_t rc;
>  
> @@ -262,8 +330,11 @@ static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm,
>  	 * We may have some uncommitted adjustments to add.  This list
>  	 * should always be really short.
>  	 */
> -	for (i = 0; i < smm->nr_uncommitted; i++) {
> -		struct block_op *op = smm->uncommitted + i;
> +	for (i = smm->uncommitted.begin;
> +	     i != smm->uncommitted.end;
> +	     i = brb_next(&smm->uncommitted, i)) {
> +
> +		struct block_op *op = smm->uncommitted.bops + i;
>  
>  		if (op->block != b)
>  			continue;
> @@ -671,7 +742,7 @@ int dm_sm_metadata_create(struct dm_space_map *sm,
>  	smm->begin = superblock + 1;
>  	smm->recursion_count = 0;
>  	smm->allocated_this_transaction = 0;
> -	smm->nr_uncommitted = 0;
> +	brb_init(&smm->uncommitted);
>  	threshold_init(&smm->threshold);
>  
>  	memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm));
> @@ -713,7 +784,7 @@ int dm_sm_metadata_open(struct dm_space_map *sm,
>  	smm->begin = 0;
>  	smm->recursion_count = 0;
>  	smm->allocated_this_transaction = 0;
> -	smm->nr_uncommitted = 0;
> +	brb_init(&smm->uncommitted);
>  	threshold_init(&smm->threshold);
>  
>  	memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll));
> -- 
> 1.9.0
> 
> --
> To unsubscribe from this list: send the line "unsubscribe stable" in
> the body of a message to majordomo@xxxxxxxxxxxxxxx
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
--
To unsubscribe from this list: send the line "unsubscribe stable" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html




[Index of Archives]     [Linux Kernel]     [Kernel Development Newbies]     [Linux USB Devel]     [Video for Linux]     [Linux Audio Users]     [Yosemite Hiking]     [Linux Kernel]     [Linux SCSI]