Allow the user to provision multiple blocks, rather than an individual block,
when a given block needs to be allocated from the pool.  This larger
allocation unit is controlled with the 'blocks_per_allocation' pool feature.

After hacking blocks_per_allocation support into the device-mapper-test-suite,
the bulk of the tests pass.  The few that fail do so because they expect fewer
blocks to be allocated than now are (due to the larger allocation unit).

I haven't yet written tests that try to showcase performance gains for certain
workloads (e.g. reads after write).  Before I do, I want to make sure I'm not
missing something fundamental (which I likely am).

There are a few FIXMEs in the new alloc_data_blocks() function that I could
really use your insight on.  Your review of my use of the bio-prison code in
alloc_data_blocks() would be especially appreciated.

Also, I'm hooking alloc_data_block() to perform the extra
'blocks_per_allocation' work -- it calls alloc_data_blocks().  This seems
right for non-snapshot use-cases, but it runs counter to your original goal of
effectively having 2 allocation units: 1) the thinp allocation blocksize and
2) the snapshot blocksize.  break_sharing() also uses alloc_data_block():
should it not be allowed to do multiple block allocations?

---
 drivers/md/dm-thin.c |  168 ++++++++++++++++++++++++++++++++++++++++++++-----
 1 files changed, 150 insertions(+), 18 deletions(-)

diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 726228b..cbe61ff 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -145,6 +145,8 @@ struct pool_features {
 	bool discard_enabled:1;
 	bool discard_passdown:1;
 	bool error_if_no_space:1;
+
+	dm_block_t blocks_per_allocation;
 };
 
 struct thin_c;
@@ -619,6 +621,23 @@ static void process_prepared_mapping_fail(struct dm_thin_new_mapping *m)
 	mempool_free(m, m->tc->pool->mapping_pool);
 }
 
+static int commit_prepared_block(struct thin_c *tc,
+				 dm_block_t virt_block, dm_block_t data_block)
+{
+	int r;
+
+	/*
+	 * Commit the prepared block into the mapping btree.
+	 * Any I/O for this block arriving after this point will get
+	 * remapped to it directly.
+	 */
+	r = dm_thin_insert_block(tc->td, virt_block, data_block);
+	if (r)
+		metadata_operation_failed(tc->pool, "dm_thin_insert_block", r);
+
+	return r;
+}
+
 static void process_prepared_mapping(struct dm_thin_new_mapping *m)
 {
 	struct thin_c *tc = m->tc;
@@ -635,14 +654,8 @@ static void process_prepared_mapping(struct dm_thin_new_mapping *m)
 		goto out;
 	}
 
-	/*
-	 * Commit the prepared block into the mapping btree.
-	 * Any I/O for this block arriving after this point will get
-	 * remapped to it directly.
-	 */
-	r = dm_thin_insert_block(tc->td, m->virt_block, m->data_block);
+	r = commit_prepared_block(tc, m->virt_block, m->data_block);
 	if (r) {
-		metadata_operation_failed(pool, "dm_thin_insert_block", r);
 		cell_error(pool, m->cell);
 		goto out;
 	}
@@ -919,7 +932,91 @@ static void check_low_water_mark(struct pool *pool, dm_block_t free_blocks)
 	}
 }
 
-static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
+static dm_block_t get_first_allocation_group_aligned_block(dm_block_t virt_block, struct pool *pool)
+{
+	if (pool->pf.blocks_per_allocation == 1)
+		return virt_block;
+
+	/* FIXME: improve efficiency here */
+	(void) sector_div(virt_block, pool->pf.blocks_per_allocation);
+	return virt_block * pool->pf.blocks_per_allocation;
+}
+
+static int alloc_data_blocks(struct thin_c *tc, dm_block_t bio_virt_block, dm_block_t *result)
+{
+	int r;
+	dm_block_t first_virt_block, last_virt_block, virt_block, data_block;
+	struct pool *pool = tc->pool;
+	struct dm_cell_key key;
+	struct dm_thin_lookup_result lookup_result;
+
+	/*
+	 * FIXME: should a new per-pool (or per-thin-dev?) multi-block mutex be used
+	 * to limit the potential to interleave simultaneous multi-block allocations?
+	 */
+
+	first_virt_block = get_first_allocation_group_aligned_block(bio_virt_block, pool);
+	last_virt_block = first_virt_block + pool->pf.blocks_per_allocation - 1;
+
+	/*
+	 * Important to allocate data_blocks in-order, to have contiguous allocation
+	 * from the pool, e.g.: <bio-less blocks> <bio-full block> <more bio-less blocks>
+	 */
+	for (virt_block = first_virt_block; virt_block <= last_virt_block; virt_block++) {
+		struct dm_bio_prison_cell *cell = NULL;

+		/*
+		 * Must lock bio-less virt_block(s) in a cell for this thin device;
+		 * the caller already has the bio-full @bio_virt_block locked in a cell.
+		 */
+		if (virt_block != bio_virt_block) {
+			build_virtual_key(tc->td, virt_block, &key);
+			if (bio_detain(pool, &key, NULL, &cell))
+				continue; /* data block is already being allocated */
+
+			/*
+			 * FIXME: if cell->holder is NULL then process_bio() must know to
+			 * handle that case.. otherwise process_bio's "nothing further
+			 * to do here" case when the cell is already occupied will break!?
+			 * - or does cell release already handle cell->holder = NULL
+			 *   but non-holder bios existing as inmates?
+			 */
+
+			r = dm_thin_find_block(tc->td, virt_block, 1, &lookup_result);
+			if (r != -ENODATA) {
+				cell_defer(tc, cell);
+				continue; /* data block already exists */
+			}
+		}
+
+		r = dm_pool_alloc_data_block(pool->pmd, &data_block);
+		if (r) {
+			metadata_operation_failed(pool, "dm_pool_alloc_data_block", r);
+			if (cell)
+				cell_error(pool, cell);
+			return r;
+		}
+
+		if (virt_block == bio_virt_block) {
+			/* process_prepared_mapping() will save data_block to the mapping btree */
+			*result = data_block;
+			continue;
+		}
+
+		/* Must commit preallocated data_block to the thin device's mapping btree */
+		r = commit_prepared_block(tc, virt_block, data_block);
+		if (r) {
+			cell_error(pool, cell);
+			return r;
+		}
+
+		cell_defer(tc, cell);
+	}
+
+	return 0;
+}
+
+static int alloc_data_block(struct thin_c *tc, dm_block_t bio_virt_block, dm_block_t *result)
 {
 	int r;
 	dm_block_t free_blocks;
@@ -957,6 +1054,9 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
 		}
 	}
 
+	if (pool->pf.blocks_per_allocation > 1)
+		return alloc_data_blocks(tc, bio_virt_block, result);
+
 	r = dm_pool_alloc_data_block(pool->pmd, result);
 	if (r) {
 		metadata_operation_failed(pool, "dm_pool_alloc_data_block", r);
@@ -1101,7 +1201,7 @@ static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
 	dm_block_t data_block;
 	struct pool *pool = tc->pool;
 
-	r = alloc_data_block(tc, &data_block);
+	r = alloc_data_block(tc, block, &data_block);
 	switch (r) {
 	case 0:
 		schedule_internal_copy(tc, block, lookup_result->block,
@@ -1177,7 +1277,7 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
 		return;
 	}
 
-	r = alloc_data_block(tc, &data_block);
+	r = alloc_data_block(tc, block, &data_block);
 	switch (r) {
 	case 0:
 		if (tc->origin_dev)
@@ -1726,6 +1826,8 @@ static void pool_features_init(struct pool_features *pf)
 	pf->discard_enabled = true;
 	pf->discard_passdown = true;
 	pf->error_if_no_space = false;
+
+	pf->blocks_per_allocation = 1;
 }
 
 static void __pool_destroy(struct pool *pool)
@@ -1941,7 +2043,7 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
 	const char *arg_name;
 
 	static struct dm_arg _args[] = {
-		{0, 4, "Invalid number of pool feature arguments"},
+		{0, 6, "Invalid number of pool feature arguments"},
 	};
 
 	/*
@@ -1973,6 +2075,20 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
 		else if (!strcasecmp(arg_name, "error_if_no_space"))
 			pf->error_if_no_space = true;
 
+		else if (!strcasecmp(arg_name, "blocks_per_allocation")) {
+			dm_block_t allocation_blocks;
+			arg_name = dm_shift_arg(as);
+			argc--;
+
+			if (kstrtoull(arg_name, 10, (unsigned long long *)&allocation_blocks)) {
+				ti->error = "Invalid blocks_per_allocation value";
+				r = -EINVAL;
+				break;
+			}
+
+			pf->blocks_per_allocation = allocation_blocks;
+		}
+
 		else {
 			ti->error = "Unrecognised pool feature requested";
 			r = -EINVAL;
@@ -2045,6 +2161,7 @@ static dm_block_t calc_metadata_threshold(struct pool_c *pt)
  *	    no_discard_passdown: don't pass discards down to the data device
  *	    read_only: Don't allow any changes to be made to the pool metadata.
  *	    error_if_no_space: error IOs, instead of queueing, if no space.
+ *	    blocks_per_allocation <blocks>: provision multiple blocks at once.
  */
 static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
 {
@@ -2214,17 +2331,27 @@ static int pool_map(struct dm_target *ti, struct bio *bio)
 	return r;
 }
 
+static dm_block_t pool_data_device_size(struct pool *pool, sector_t table_length)
+{
+	dm_block_t blocks = table_length;
+
+	/* round length down to a blocks_per_allocation multiple */
+	(void) dm_sector_div64(blocks, pool->sectors_per_block * pool->pf.blocks_per_allocation);
+	blocks *= pool->pf.blocks_per_allocation;
+
+	return blocks;
+}
+
 static int maybe_resize_data_dev(struct dm_target *ti, bool *need_commit)
 {
 	int r;
 	struct pool_c *pt = ti->private;
 	struct pool *pool = pt->pool;
-	sector_t data_size = ti->len;
-	dm_block_t sb_data_size;
+	dm_block_t data_size, sb_data_size;
 
 	*need_commit = false;
 
-	(void) sector_div(data_size, pool->sectors_per_block);
+	data_size = pool_data_device_size(pool, ti->len);
 
 	r = dm_pool_get_data_dev_size(pool->pmd, &sb_data_size);
 	if (r) {
@@ -2562,7 +2689,7 @@ static void emit_flags(struct pool_features *pf, char *result,
 {
 	unsigned count = !pf->zero_new_blocks + !pf->discard_enabled +
 		!pf->discard_passdown + (pf->mode == PM_READ_ONLY) +
-		pf->error_if_no_space;
+		pf->error_if_no_space + 2 * (pf->blocks_per_allocation > 1);
 	DMEMIT("%u ", count);
 
 	if (!pf->zero_new_blocks)
@@ -2579,6 +2706,9 @@ static void emit_flags(struct pool_features *pf, char *result,
 
 	if (pf->error_if_no_space)
 		DMEMIT("error_if_no_space ");
+
+	if (pf->blocks_per_allocation > 1)
+		DMEMIT("blocks_per_allocation %llu", pf->blocks_per_allocation);
 }
 
 /*
@@ -2684,6 +2814,9 @@ static void pool_status(struct dm_target *ti, status_type_t type,
 		else
 			DMEMIT("queue_if_no_space ");
 
+		if (pool->pf.blocks_per_allocation > 1)
+			DMEMIT("blocks_per_allocation %llu ", pool->pf.blocks_per_allocation);
+
 		break;
 
 	case STATUSTYPE_TABLE:
@@ -3047,7 +3180,7 @@ err:
 
 static int thin_iterate_devices(struct dm_target *ti,
 				iterate_devices_callout_fn fn, void *data)
 {
-	sector_t blocks;
+	dm_block_t blocks;
 	struct thin_c *tc = ti->private;
 	struct pool *pool = tc->pool;
@@ -3058,8 +3191,7 @@ static int thin_iterate_devices(struct dm_target *ti,
 	if (!pool->ti)
 		return 0;	/* nothing is bound */
 
-	blocks = pool->ti->len;
-	(void) sector_div(blocks, pool->sectors_per_block);
+	blocks = pool_data_device_size(pool, pool->ti->len);
 
 	if (blocks)
 		return fn(ti, tc->pool_dev, 0, pool->sectors_per_block * blocks, data);
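
P.S. For anyone who wants to sanity-check the allocation-group math outside
the kernel, here is a small userspace sketch (not part of the patch; the
helper name and the sample numbers are mine) of what
get_first_allocation_group_aligned_block() plus the alloc_data_blocks() loop
boil down to: round the bio's virtual block down to a blocks_per_allocation
boundary, then walk the whole group in order, returning the bio's own block
to the caller and committing the rest via commit_prepared_block().

#include <stdio.h>
#include <stdint.h>

typedef uint64_t dm_block_t;

/* userspace stand-in for get_first_allocation_group_aligned_block(): round down */
static dm_block_t first_group_block(dm_block_t virt_block, dm_block_t blocks_per_allocation)
{
	if (blocks_per_allocation == 1)
		return virt_block;

	return (virt_block / blocks_per_allocation) * blocks_per_allocation;
}

int main(void)
{
	dm_block_t bio_virt_block = 21;	/* virtual block targeted by the bio (made up) */
	dm_block_t bpa = 8;		/* blocks_per_allocation (made up) */
	dm_block_t first = first_group_block(bio_virt_block, bpa);
	dm_block_t last = first + bpa - 1;
	dm_block_t b;

	/* alloc_data_blocks() walks the group in order so the data blocks come out contiguous */
	printf("allocation group for virt block %llu: %llu..%llu\n",
	       (unsigned long long)bio_virt_block,
	       (unsigned long long)first, (unsigned long long)last);

	for (b = first; b <= last; b++)
		printf("  virt %llu: %s\n", (unsigned long long)b,
		       b == bio_virt_block ?
		       "returned in *result, mapped later by process_prepared_mapping()" :
		       "pre-provisioned and committed via commit_prepared_block()");

	return 0;
}

With those numbers the group is virtual blocks 16..23, i.e. a write to block
21 ends up provisioning eight data blocks.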
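
Similarly, the new pool_data_device_size() rounds the data device down to
whole allocation groups, so maybe_resize_data_dev() and thin_iterate_devices()
only ever see a blocks_per_allocation multiple.  The same arithmetic in
userspace, again just a sketch with made-up numbers:

#include <stdio.h>
#include <stdint.h>

/*
 * Userspace equivalent of pool_data_device_size(): divide the table length
 * (in sectors) by the allocation-group size in sectors, then scale back up
 * to blocks -- i.e. round down to a whole number of allocation groups.
 */
static uint64_t usable_blocks(uint64_t length_sectors, uint64_t sectors_per_block,
			      uint64_t blocks_per_allocation)
{
	uint64_t groups = length_sectors / (sectors_per_block * blocks_per_allocation);

	return groups * blocks_per_allocation;
}

int main(void)
{
	uint64_t len = 1000000;	/* data device length in sectors (made up) */
	uint64_t spb = 128;	/* sectors per thin block, i.e. 64KiB */
	uint64_t bpa = 8;	/* blocks_per_allocation */

	printf("without the feature: %llu blocks\n",
	       (unsigned long long)(len / spb));
	printf("with blocks_per_allocation=%llu: %llu blocks\n",
	       (unsigned long long)bpa,
	       (unsigned long long)usable_blocks(len, spb, bpa));
	return 0;
}

That prints 7812 vs 7808 blocks, i.e. the partial group at the end of the
data device is simply left unused.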