On Sat, Mar 22, 2014 at 11:44:46AM +0100, Jiri Slaby wrote: > From: Joe Thornber <ejt@xxxxxxxxxx> > > 3.12-stable review patch. If anyone has any objections, please let me know. This patch is tagged for stable 3.13+ so I'm not sure it's suitable for a 3.12 kernel. Cheers, -- Luís > =============== > > commit cebc2de44d3bce53e46476e774126c298ca2c8a9 upstream. > > This has been a relatively long-standing issue that wasn't nailed down > until Teng-Feng Yang's meticulous bug report to dm-devel on 3/7/2014, > see: http://www.redhat.com/archives/dm-devel/2014-March/msg00021.html > > From that report: > "When decreasing the reference count of a metadata block with its > reference count equals 3, we will call dm_btree_remove() to remove > this enrty from the B+tree which keeps the reference count info in > metadata device. > > The B+tree will try to rebalance the entry of the child nodes in each > node it traversed, and the rebalance process contains the following > steps. > > (1) Finding the corresponding children in current node (shadow_current(s)) > (2) Shadow the children block (issue BOP_INC) > (3) redistribute keys among children, and free children if necessary (issue BOP_DEC) > > Since the update of a metadata block's reference count could be > recursive, we will stash these reference count update operations in > smm->uncommitted and then process them in a FILO fashion. > > The problem is that step(3) could free the children which is created > in step(2), so the BOP_DEC issued in step(3) will be carried out > before the BOP_INC issued in step(2) since these BOPs will be > processed in FILO fashion. Once the BOP_DEC from step(3) tries to > decrease the reference count of newly shadow block, it will report > failure for its reference equals 0 before decreasing. It looks like we > can solve this issue by processing these BOPs in a FIFO fashion > instead of FILO." > > Commit 5b564d80 ("dm space map: disallow decrementing a reference count > below zero") changed the code to report an error for this temporary > refcount decrement below zero. So what was previously a harmless > invalid refcount became a hard failure due to the new error path: > > device-mapper: space map common: unable to decrement a reference count below 0 > device-mapper: thin: 253:6: dm_thin_insert_block() failed: error = -22 > device-mapper: thin: 253:6: switching pool to read-only mode > > This bug is in dm persistent-data code that is common to the DM thin and > cache targets. So any users of those targets should apply this fix. > > Fix this by applying recursive space map operations in FIFO order rather > than FILO. > > Resolves: https://bugzilla.kernel.org/show_bug.cgi?id=68801 > > Reported-by: Apollon Oikonomopoulos <apoikos@xxxxxxxxxx> > Reported-by: edwillam1007@xxxxxxxxx > Reported-by: Teng-Feng Yang <shinrairis@xxxxxxxxx> > Signed-off-by: Joe Thornber <ejt@xxxxxxxxxx> > Signed-off-by: Mike Snitzer <snitzer@xxxxxxxxxx> > Signed-off-by: Jiri Slaby <jslaby@xxxxxxx> > --- > drivers/md/persistent-data/dm-space-map-metadata.c | 113 +++++++++++++++++---- > 1 file changed, 92 insertions(+), 21 deletions(-) > > diff --git a/drivers/md/persistent-data/dm-space-map-metadata.c b/drivers/md/persistent-data/dm-space-map-metadata.c > index afb419e514bf..579b58200bf2 100644 > --- a/drivers/md/persistent-data/dm-space-map-metadata.c > +++ b/drivers/md/persistent-data/dm-space-map-metadata.c > @@ -91,6 +91,69 @@ struct block_op { > dm_block_t block; > }; > > +struct bop_ring_buffer { > + unsigned begin; > + unsigned end; > + struct block_op bops[MAX_RECURSIVE_ALLOCATIONS + 1]; > +}; > + > +static void brb_init(struct bop_ring_buffer *brb) > +{ > + brb->begin = 0; > + brb->end = 0; > +} > + > +static bool brb_empty(struct bop_ring_buffer *brb) > +{ > + return brb->begin == brb->end; > +} > + > +static unsigned brb_next(struct bop_ring_buffer *brb, unsigned old) > +{ > + unsigned r = old + 1; > + return (r >= (sizeof(brb->bops) / sizeof(*brb->bops))) ? 0 : r; > +} > + > +static int brb_push(struct bop_ring_buffer *brb, > + enum block_op_type type, dm_block_t b) > +{ > + struct block_op *bop; > + unsigned next = brb_next(brb, brb->end); > + > + /* > + * We don't allow the last bop to be filled, this way we can > + * differentiate between full and empty. > + */ > + if (next == brb->begin) > + return -ENOMEM; > + > + bop = brb->bops + brb->end; > + bop->type = type; > + bop->block = b; > + > + brb->end = next; > + > + return 0; > +} > + > +static int brb_pop(struct bop_ring_buffer *brb, struct block_op *result) > +{ > + struct block_op *bop; > + > + if (brb_empty(brb)) > + return -ENODATA; > + > + bop = brb->bops + brb->begin; > + result->type = bop->type; > + result->block = bop->block; > + > + brb->begin = brb_next(brb, brb->begin); > + > + return 0; > +} > + > +/*----------------------------------------------------------------*/ > + > struct sm_metadata { > struct dm_space_map sm; > > @@ -101,25 +164,20 @@ struct sm_metadata { > > unsigned recursion_count; > unsigned allocated_this_transaction; > - unsigned nr_uncommitted; > - struct block_op uncommitted[MAX_RECURSIVE_ALLOCATIONS]; > + struct bop_ring_buffer uncommitted; > > struct threshold threshold; > }; > > static int add_bop(struct sm_metadata *smm, enum block_op_type type, dm_block_t b) > { > - struct block_op *op; > + int r = brb_push(&smm->uncommitted, type, b); > > - if (smm->nr_uncommitted == MAX_RECURSIVE_ALLOCATIONS) { > + if (r) { > DMERR("too many recursive allocations"); > return -ENOMEM; > } > > - op = smm->uncommitted + smm->nr_uncommitted++; > - op->type = type; > - op->block = b; > - > return 0; > } > > @@ -158,11 +216,17 @@ static int out(struct sm_metadata *smm) > return -ENOMEM; > } > > - if (smm->recursion_count == 1 && smm->nr_uncommitted) { > - while (smm->nr_uncommitted && !r) { > - smm->nr_uncommitted--; > - r = commit_bop(smm, smm->uncommitted + > - smm->nr_uncommitted); > + if (smm->recursion_count == 1) { > + while (!brb_empty(&smm->uncommitted)) { > + struct block_op bop; > + > + r = brb_pop(&smm->uncommitted, &bop); > + if (r) { > + DMERR("bug in bop ring buffer"); > + break; > + } > + > + r = commit_bop(smm, &bop); > if (r) > break; > } > @@ -217,7 +281,8 @@ static int sm_metadata_get_nr_free(struct dm_space_map *sm, dm_block_t *count) > static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b, > uint32_t *result) > { > - int r, i; > + int r; > + unsigned i; > struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); > unsigned adjustment = 0; > > @@ -225,8 +290,10 @@ static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b, > * We may have some uncommitted adjustments to add. This list > * should always be really short. > */ > - for (i = 0; i < smm->nr_uncommitted; i++) { > - struct block_op *op = smm->uncommitted + i; > + for (i = smm->uncommitted.begin; > + i != smm->uncommitted.end; > + i = brb_next(&smm->uncommitted, i)) { > + struct block_op *op = smm->uncommitted.bops + i; > > if (op->block != b) > continue; > @@ -254,7 +321,8 @@ static int sm_metadata_get_count(struct dm_space_map *sm, dm_block_t b, > static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm, > dm_block_t b, int *result) > { > - int r, i, adjustment = 0; > + int r, adjustment = 0; > + unsigned i; > struct sm_metadata *smm = container_of(sm, struct sm_metadata, sm); > uint32_t rc; > > @@ -262,8 +330,11 @@ static int sm_metadata_count_is_more_than_one(struct dm_space_map *sm, > * We may have some uncommitted adjustments to add. This list > * should always be really short. > */ > - for (i = 0; i < smm->nr_uncommitted; i++) { > - struct block_op *op = smm->uncommitted + i; > + for (i = smm->uncommitted.begin; > + i != smm->uncommitted.end; > + i = brb_next(&smm->uncommitted, i)) { > + > + struct block_op *op = smm->uncommitted.bops + i; > > if (op->block != b) > continue; > @@ -671,7 +742,7 @@ int dm_sm_metadata_create(struct dm_space_map *sm, > smm->begin = superblock + 1; > smm->recursion_count = 0; > smm->allocated_this_transaction = 0; > - smm->nr_uncommitted = 0; > + brb_init(&smm->uncommitted); > threshold_init(&smm->threshold); > > memcpy(&smm->sm, &bootstrap_ops, sizeof(smm->sm)); > @@ -713,7 +784,7 @@ int dm_sm_metadata_open(struct dm_space_map *sm, > smm->begin = 0; > smm->recursion_count = 0; > smm->allocated_this_transaction = 0; > - smm->nr_uncommitted = 0; > + brb_init(&smm->uncommitted); > threshold_init(&smm->threshold); > > memcpy(&smm->old_ll, &smm->ll, sizeof(smm->old_ll)); > -- > 1.9.0 > > -- > To unsubscribe from this list: send the line "unsubscribe stable" in > the body of a message to majordomo@xxxxxxxxxxxxxxx > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe stable" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html