In my testing, the bios dispatched to a dm-mirror target sometimes have a bi_size larger than the target's region size. Here are some of the bi_size values caught while running bonnie++ on a dm-mirror target (formatted as ext3) whose region size is 2048 bytes:

.........
************************* bi_size (in bytes) - bi_sector
Jun 24 15:07:28 darkstar kernel: Catch it:49152 - 1060816
Jun 24 15:07:28 darkstar kernel: Catch it:49152 - 1070040
Jun 24 15:07:28 darkstar kernel: Catch it:32768 - 1150960
Jun 24 15:07:28 darkstar kernel: Catch it:32768 - 1160184
Jun 24 15:07:28 darkstar kernel: Catch it:49152 - 1176552
Jun 24 15:07:28 darkstar kernel: Catch it:49152 - 1185768
Jun 24 15:07:28 darkstar kernel: Catch it:49152 - 1225704
Jun 24 15:07:28 darkstar kernel: Catch it:49152 - 1234912
Jun 24 15:07:28 darkstar kernel: Catch it:32768 - 1258464
Jun 24 15:07:28 darkstar kernel: Catch it:32768 - 1291256
Jun 24 15:09:14 darkstar kernel: Catch it:131072 - 1920960
Jun 24 15:09:15 darkstar kernel: Catch it:131072 - 1921984
......................

The debug code was inserted at the top of queue_bio(struct mirror_set *ms, struct bio *bio, int rw):

+	if ((bio->bi_sector >> ms->rh.region_shift) !=
+	    ((bio->bi_sector + ((bio->bi_size - 1) >> 9)) >> ms->rh.region_shift)) {
+		printk("Catch it:%d - %d\n", bio->bi_size, bio->bi_sector);
+	}

The key point is that a READ/WRITE request to dm-mirror can affect more than one region, but the current implementation assumes that only one region is affected and locked; the other regions are not locked. For example:

[__________________________MIRROR_1____________________________]
[__________________________MIRROR_2____________________________]
+---------------------+--------------------+-----------------+----------------------+
| sync region         | recovering region  | nosync region   | nosync region        |
+---------------------+--------------------+-----------------+----------------------+
                              ^ recovery is working here now.
<--------------------------READ bio range-------------------------------------------->

In this situation, a READ could be served from a nosync mirror and return corrupted data. WRITE has a similar problem.
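To make the numbers above concrete, here is a minimal stand-alone sketch (user space, not part of the patch) of the same first/last-region calculation the debug check performs. The example values are taken from the first "Catch it" line, and deriving region_shift from the region size assumes the region size is a power of two:

#include <stdio.h>
#include <stdint.h>

/* Stand-alone illustration of the check in queue_bio(): a bio covers
 * sectors [bi_sector, bi_sector + bi_size/512), and it crosses a region
 * boundary whenever its first and last sectors map to different regions. */
int main(void)
{
	uint64_t bi_sector = 1060816;		/* from the first "Catch it" line */
	uint32_t bi_size = 49152;		/* bytes */
	uint32_t region_sectors = 2048 / 512;	/* 2048-byte regions = 4 sectors */
	unsigned int region_shift = 0;
	uint64_t first, last;

	while ((1u << region_shift) < region_sectors)
		region_shift++;			/* region_shift = 2 here */

	first = bi_sector >> region_shift;
	last = (bi_sector + ((bi_size - 1) >> 9)) >> region_shift;

	printf("first region %llu, last region %llu, regions touched %llu\n",
	       (unsigned long long)first, (unsigned long long)last,
	       (unsigned long long)(last - first + 1));
	return 0;
}

With these values the bio touches 24 regions (49152 bytes = 96 sectors = 24 regions of 4 sectors), so locking a single region clearly cannot cover the whole request.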
Sincerely,
Zhao Qian <zhaoqian@xxxxxxxxxxx>
Du Jun <dujun@xxxxxxxxxxx>

Here is a patch to solve this problem:

[root@darkstar src]# diff -u orig/linux-2.6.11/drivers/md/dm-raid1.c linux-2.6.11.11/drivers/md/dm-raid1.c
--- orig/linux-2.6.11/drivers/md/dm-raid1.c	2005-03-02 15:38:33.000000000 +0800
+++ linux-2.6.11.11/drivers/md/dm-raid1.c	2005-06-27 11:58:08.000000000 +0800
@@ -76,6 +76,7 @@
 	/* hash table */
 	rwlock_t hash_lock;
 	mempool_t *region_pool;
+	mempool_t *region_pair_pool;
 	unsigned int mask;
 	unsigned int nr_buckets;
 	struct list_head *buckets;
@@ -85,6 +86,9 @@
 	struct list_head clean_regions;
 	struct list_head quiesced_regions;
 	struct list_head recovered_regions;
+
+	spinlock_t region_pair_lock;
+	struct list_head unused_region_pairs;
 };
 
 enum {
@@ -132,7 +136,32 @@
 	kfree(element);
 }
 
+struct region_pair
+{
+	region_t first;
+	region_t last;
+	struct list_head list;
+};
+
+static struct region_pair *bio_to_region_pair(struct region_hash *rh, struct bio *bio, struct region_pair *rp)
+{
+	rp->first = bio->bi_sector >> rh->region_shift;
+	rp->last = (bio->bi_sector + ((bio->bi_size - 1) >> 9)) >> rh->region_shift;
+	return rp;
+}
+
+static void *region_pair_alloc(int gfp_mask, void *pool_data)
+{
+	return kmalloc(sizeof(struct region_pair), gfp_mask);
+}
+
+static void region_pair_free(void *element, void *pool_data)
+{
+	kfree(element);
+}
+
 #define MIN_REGIONS 64
+#define MIN_REGION_PAIRS 128
 #define MAX_RECOVERY 1
 static int rh_init(struct region_hash *rh, struct mirror_set *ms,
 		   struct dirty_log *log, uint32_t region_size,
@@ -173,6 +202,9 @@
 	INIT_LIST_HEAD(&rh->quiesced_regions);
 	INIT_LIST_HEAD(&rh->recovered_regions);
 
+	spin_lock_init(&rh->region_pair_lock);
+	INIT_LIST_HEAD(&rh->unused_region_pairs);
+
 	rh->region_pool = mempool_create(MIN_REGIONS, region_alloc,
 					 region_free, NULL);
 	if (!rh->region_pool) {
@@ -181,6 +213,15 @@
 		return -ENOMEM;
 	}
 
+	rh->region_pair_pool = mempool_create(MIN_REGION_PAIRS, region_pair_alloc,
+					      region_pair_free, NULL);
+	if (!rh->region_pair_pool) {
+		vfree(rh->buckets);
+		rh->buckets = NULL;
+		mempool_destroy(rh->region_pool);
+		return -ENOMEM;
+	}
+
 	return 0;
 }
 
@@ -188,6 +229,7 @@
 {
 	unsigned int h;
 	struct region *reg, *nreg;
+	struct region_pair *pair, *pair_next;
 
 	BUG_ON(!list_empty(&rh->quiesced_regions));
 	for (h = 0; h < rh->nr_buckets; h++) {
@@ -197,10 +239,15 @@
 		}
 	}
 
+	list_for_each_entry_safe(pair, pair_next, &rh->unused_region_pairs, list)
+		mempool_free(pair, rh->region_pair_pool);
+
 	if (rh->log)
 		dm_destroy_dirty_log(rh->log);
 	if (rh->region_pool)
 		mempool_destroy(rh->region_pool);
+	if (rh->region_pair_pool)
+		mempool_destroy(rh->region_pair_pool);
 	vfree(rh->buckets);
 }
 
@@ -320,9 +367,11 @@
 static void rh_update_states(struct region_hash *rh)
 {
 	struct region *reg, *next;
+	struct region_pair *pair, *pair_next;
 
 	LIST_HEAD(clean);
 	LIST_HEAD(recovered);
+	LIST_HEAD(unused);
 
 	/*
 	 * Quickly grab the lists.
@@ -347,6 +396,12 @@
 		list_del(&reg->hash_list);
 	}
 	spin_unlock(&rh->region_lock);
+
+	spin_lock(&rh->region_pair_lock);
+	list_splice(&rh->unused_region_pairs, &unused);
+	INIT_LIST_HEAD(&rh->unused_region_pairs);
+	spin_unlock(&rh->region_pair_lock);
+
 	write_unlock_irq(&rh->hash_lock);
 
 	/*
@@ -367,6 +422,9 @@
 
 	list_for_each_entry_safe (reg, next, &clean, list)
 		mempool_free(reg, rh->region_pool);
+
+	list_for_each_entry_safe(pair, pair_next, &unused, list)
+		mempool_free(pair, rh->region_pair_pool);
 }
 
 static void rh_inc(struct region_hash *rh, region_t region)
@@ -391,9 +449,15 @@
 static void rh_inc_pending(struct region_hash *rh, struct bio_list *bios)
 {
 	struct bio *bio;
+	struct region_pair rp;
+	region_t r;
 
-	for (bio = bios->head; bio; bio = bio->bi_next)
-		rh_inc(rh, bio_to_region(rh, bio));
+	for (bio = bios->head; bio; bio = bio->bi_next) {
+		bio_to_region_pair(rh, bio, &rp);
+		for (r = rp.first; r <= rp.last; r++) {
+			rh_inc(rh, r);
+		}
+	}
 }
 
 static void rh_dec(struct region_hash *rh, region_t region)
@@ -506,14 +570,14 @@
 	rh->log->type->flush(rh->log);
 }
 
-static void rh_delay(struct region_hash *rh, struct bio *bio)
+static void rh_delay_on_region(struct region_hash *rh, struct bio *bio, region_t region)
 {
 	struct region *reg;
 
 	read_lock(&rh->hash_lock);
-	reg = __rh_find(rh, bio_to_region(rh, bio));
+	reg = __rh_find(rh, region);
 	bio_list_add(&reg->delayed_bios, bio);
 	read_unlock(&rh->hash_lock);
 }
 
 static void rh_stop_recovery(struct region_hash *rh)
@@ -692,17 +756,26 @@
 
 static void do_reads(struct mirror_set *ms, struct bio_list *reads)
 {
-	region_t region;
 	struct bio *bio;
 	struct mirror *m;
+	struct region_pair rp;
+	region_t r;
+	int sync_count;
 
 	while ((bio = bio_list_pop(reads))) {
-		region = bio_to_region(&ms->rh, bio);
+		bio_to_region_pair(&ms->rh, bio, &rp);
 
 		/*
 		 * We can only read balance if the region is in sync.
 		 */
-		if (rh_in_sync(&ms->rh, region, 0))
+		sync_count = 0;
+		for (r = rp.first; r <= rp.last; r++) {
+			if (rh_in_sync(&ms->rh, r, 0)) {
+				sync_count++;
+			}
+		}
+		if (sync_count > (rp.last - rp.first))
 			m = choose_mirror(ms, bio->bi_sector);
 		else
 			m = ms->mirror + DEFAULT_MIRROR;
@@ -779,6 +852,8 @@
 	int state;
 	struct bio *bio;
 	struct bio_list sync, nosync, recover, *this_list = NULL;
+	struct region_pair rp;
+	region_t r;
 
 	if (!writes->head)
 		return;
@@ -791,23 +866,28 @@
 	bio_list_init(&recover);
 
 	while ((bio = bio_list_pop(writes))) {
-		state = rh_state(&ms->rh, bio_to_region(&ms->rh, bio), 1);
-		switch (state) {
-		case RH_CLEAN:
-		case RH_DIRTY:
-			this_list = &sync;
-			break;
-
-		case RH_NOSYNC:
-			this_list = &nosync;
-			break;
-
-		case RH_RECOVERING:
-			this_list = &recover;
-			break;
+		bio_to_region_pair(&ms->rh, bio, &rp);
+		this_list = &sync;
+		for (r = rp.first; r <= rp.last; r++) {
+			state = rh_state(&ms->rh, r, 1);
+			switch (state) {
+			case RH_CLEAN:
+			case RH_DIRTY:
+				break;
+
+			case RH_NOSYNC:
+				this_list = &nosync;
+				break;
+
+			case RH_RECOVERING:
+				rh_delay_on_region(&ms->rh, bio, r);
+				this_list = &recover;
+				break;
+			}
+			if (&recover == this_list)
+				break;
 		}
-		bio_list_add(this_list, bio);
+		if (&recover != this_list)
+			bio_list_add(this_list, bio);
 	}
 
 	/*
@@ -825,9 +905,6 @@
 	while ((bio = bio_list_pop(&sync)))
 		do_write(ms, bio);
 
-	while ((bio = bio_list_pop(&recover)))
-		rh_delay(&ms->rh, bio);
-
 	while ((bio = bio_list_pop(&nosync))) {
 		map_bio(ms, ms->mirror + DEFAULT_MIRROR, bio);
 		generic_make_request(bio);
@@ -1105,9 +1182,10 @@
 	struct mirror *m;
 	struct mirror_set *ms = ti->private;
 
-	map_context->ll = bio->bi_sector >> ms->rh.region_shift;
-
 	if (rw == WRITE) {
+		struct region_pair *rp = mempool_alloc(ms->rh.region_pair_pool, GFP_NOIO);
+		bio_to_region_pair(&ms->rh, bio, rp);
+		map_context->ptr = rp;
 		queue_bio(ms, bio, rw);
 		return 0;
 	}
@@ -1128,7 +1206,7 @@
 	if (!r && rw == READA)
 		return -EIO;
 
-	if (!r) {
+	if (!r || bio_sectors(bio) > 1) {
 		/* Pass this io over to the daemon */
 		queue_bio(ms, bio, rw);
 		return 0;
@@ -1145,15 +1223,21 @@
 static int mirror_end_io(struct dm_target *ti, struct bio *bio,
 			 int error, union map_info *map_context)
 {
+	region_t region;
 	int rw = bio_rw(bio);
 	struct mirror_set *ms = (struct mirror_set *) ti->private;
-	region_t region = map_context->ll;
 
 	/*
 	 * We need to dec pending if this was a write.
 	 */
-	if (rw == WRITE)
-		rh_dec(&ms->rh, region);
+	if (rw == WRITE) {
+		struct region_pair *rp = (struct region_pair *) map_context->ptr;
+		for (region = rp->first; region <= rp->last; region++)
+			rh_dec(&ms->rh, region);
+		spin_lock_irq(&ms->rh.region_pair_lock);
+		list_add(&rp->list, &ms->rh.unused_region_pairs);
+		spin_unlock_irq(&ms->rh.region_pair_lock);
+	}
 
 	return 0;
 }
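In case it helps with testing, a rough way to reproduce the multi-region bios without bonnie++ might be a single large O_DIRECT write to the mirror device; everything below (device name, I/O size, alignment) is only an assumption for illustration and is not part of the report or the patch:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/* Hypothetical reproducer sketch: issue one large O_DIRECT write so the
 * block layer is likely to build bios whose bi_size exceeds the 2048-byte
 * region size.  Adjust the device name and sizes for the local setup. */
int main(void)
{
	const char *dev = "/dev/mapper/testmirror";	/* assumed mirror device */
	const size_t len = 128 * 1024;			/* 128 KiB, many 2 KiB regions */
	void *buf;
	int fd;

	if (posix_memalign(&buf, 4096, len)) {
		perror("posix_memalign");
		return 1;
	}
	memset(buf, 0xab, len);

	fd = open(dev, O_WRONLY | O_DIRECT);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	if (pwrite(fd, buf, len, 0) != (ssize_t)len)
		perror("pwrite");
	close(fd);
	free(buf);
	return 0;
}

Any bio built from such a write whose bi_size is larger than the region size should then show up in the debug check quoted at the top of this mail.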