Hi

Thanks for reviewing this. I applied your patch; the new version is here:
http://people.redhat.com/mpatocka/patches/kernel/dm-lock-optimization/

Mikulas

On Thu, 10 May 2012, Jun'ichi Nomura wrote:

> Hi Mikulas,
> 
> On 05/02/12 11:17, Mikulas Patocka wrote:
> > I placed the new code using srcu here:
> > http://people.redhat.com/mpatocka/patches/kernel/dm-lock-optimization/
> > 
> > It removes io_lock, map_lock and holders and replaces them with srcu.
> 
> I've reviewed the patches and here are some comments.
> The first one seems to be a bug.
> The others are minor comments about readability.
> 
>   - There are functions that destroy the inactive table
>     without SRCU synchronization:
>       * table_load
>       * table_clear
>       * do_resume
>       * __hash_remove
>     This could cause a use-after-free for users of
>     dm_get_inactive_table(). Synchronization is needed,
>     outside of the exclusive hash_lock.
> 
>   - I couldn't see the reason why locking is different
>     for the request-based case in dm_wq_work().
> 
>   - We could use dm_get_live_table() in the caller of
>     __split_and_process_bio() and pass the result to it.
>     Then the naked uses of srcu_read_lock/unlock and
>     rcu_dereference can be eliminated. I think it helps
>     readability.
> 
>   - md->map is directly referenced in dm_suspend/dm_resume.
>     That is OK because md->suspend_lock is taken and nobody
>     can replace md->map, but I think it's worth putting a
>     comment about this rule in 'struct mapped_device'.
> 
> The attached patch expresses the above comments in code.
> 
> >> synchronize_rcu could be put in dm_table_destroy() instead of __bind().
> >> I think it's a safer place to wait.
> > 
> > I think the code is more readable if the RCU synchronization is done
> > just after assigning the pointer that is protected by RCU.
> 
> OK. I don't object.
> 
> Thanks,
> ---
> Jun'ichi Nomura, NEC Corporation
> 
> * Added a comment about locking for reading md->map
> * Added dm_sync_table() to wait for other processes to exit the
>   read-side critical section
> * __split_and_process_bio() now takes the map as an argument
> * __hash_remove() returns a table pointer to be destroyed later
> * Added dm_sync_table() calls in the dm-ioctl.c functions to synchronize
>   with users of the inactive table
> 
> Index: linux-3.3/drivers/md/dm.c
> ===================================================================
> --- linux-3.3.orig/drivers/md/dm.c	2012-05-10 12:19:10.977242964 +0900
> +++ linux-3.3/drivers/md/dm.c	2012-05-10 13:54:30.150853855 +0900
> @@ -141,6 +141,8 @@ struct mapped_device {
>  
>  	/*
>  	 * The current mapping.
> +	 * Use dm_get_live_table{_fast} or take suspend_lock for
> +	 * dereference.
>  	 */
>  	struct dm_table *map;
>  
> @@ -561,6 +563,12 @@ void dm_put_live_table(struct mapped_dev
>  	srcu_read_unlock(&md->io_barrier, srcu_idx);
>  }
>  
> +void dm_sync_table(struct mapped_device *md)
> +{
> +	synchronize_srcu(&md->io_barrier);
> +	synchronize_rcu_expedited();
> +}
> +
>  /*
>   * A fast alternative to dm_get_live_table/dm_put_live_table.
>   * The caller must not block between these two functions.
> @@ -1316,17 +1324,18 @@ static int __clone_and_map(struct clone_
>  /*
>   * Split the bio into several clones and submit it to targets.
>   */
> -static void __split_and_process_bio(struct mapped_device *md, struct bio *bio)
> +static void __split_and_process_bio(struct mapped_device *md,
> +				    struct dm_table *map, struct bio *bio)
>  {
>  	struct clone_info ci;
>  	int error = 0;
>  
> -	ci.map = srcu_dereference(md->map, &md->io_barrier);
> -	if (unlikely(!ci.map)) {
> +	if (unlikely(!map)) {
>  		bio_io_error(bio);
>  		return;
>  	}
>  
> +	ci.map = map;
>  	ci.md = md;
>  	ci.io = alloc_io(md);
>  	ci.io->error = 0;
> @@ -1422,8 +1431,9 @@ static void _dm_request(struct request_q
>  	struct mapped_device *md = q->queuedata;
>  	int cpu;
>  	int srcu_idx;
> +	struct dm_table *map;
>  
> -	srcu_idx = srcu_read_lock(&md->io_barrier);
> +	map = dm_get_live_table(md, &srcu_idx);
>  
>  	cpu = part_stat_lock();
>  	part_stat_inc(cpu, &dm_disk(md)->part0, ios[rw]);
> @@ -1432,7 +1442,7 @@ static void _dm_request(struct request_q
>  
>  	/* if we're suspended, we have to queue this io for later */
>  	if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags))) {
> -		srcu_read_unlock(&md->io_barrier, srcu_idx);
> +		dm_put_live_table(md, srcu_idx);
>  
>  		if (bio_rw(bio) != READA)
>  			queue_io(md, bio);
> @@ -1441,8 +1451,8 @@ static void _dm_request(struct request_q
>  		return;
>  	}
>  
> -	__split_and_process_bio(md, bio);
> -	srcu_read_unlock(&md->io_barrier, srcu_idx);
> +	__split_and_process_bio(md, map, bio);
> +	dm_put_live_table(md, srcu_idx);
>  	return;
>  }
>  
> @@ -2115,8 +2125,7 @@ static struct dm_table *__bind(struct ma
>  		set_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
>  	else
>  		clear_bit(DMF_MERGE_IS_OPTIONAL, &md->flags);
> -	synchronize_srcu(&md->io_barrier);
> -	synchronize_rcu_expedited();
> +	dm_sync_table(md);
>  
>  	return old_map;
>  }
> @@ -2133,8 +2142,7 @@ static struct dm_table *__unbind(struct
>  
>  	dm_table_event_callback(map, NULL, NULL);
>  	rcu_assign_pointer(md->map, NULL);
> -	synchronize_srcu(&md->io_barrier);
> -	synchronize_rcu_expedited();
> +	dm_sync_table(md);
>  
>  	return map;
>  }
> @@ -2375,8 +2383,9 @@ static void dm_wq_work(struct work_struc
>  						work);
>  	struct bio *c;
>  	int srcu_idx;
> +	struct dm_table *map;
>  
> -	srcu_idx = srcu_read_lock(&md->io_barrier);
> +	map = dm_get_live_table(md, &srcu_idx);
>  
>  	while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
>  		spin_lock_irq(&md->deferred_lock);
> @@ -2386,15 +2395,13 @@ static void dm_wq_work(struct work_struc
>  		if (!c)
>  			break;
>  
> -		if (dm_request_based(md)) {
> -			srcu_read_unlock(&md->io_barrier, srcu_idx);
> +		if (dm_request_based(md))
>  			generic_make_request(c);
> -			srcu_idx = srcu_read_lock(&md->io_barrier);
> -		} else
> -			__split_and_process_bio(md, c);
> +		else
> +			__split_and_process_bio(md, map, c);
>  	}
>  
> -	srcu_read_unlock(&md->io_barrier, srcu_idx);
> +	dm_put_live_table(md, srcu_idx);
>  }
>  
>  static void dm_queue_flush(struct mapped_device *md)
> Index: linux-3.3/drivers/md/dm-ioctl.c
> ===================================================================
> --- linux-3.3.orig/drivers/md/dm-ioctl.c	2012-05-10 12:19:10.995242964 +0900
> +++ linux-3.3/drivers/md/dm-ioctl.c	2012-05-10 12:54:30.312180811 +0900
> @@ -250,7 +250,7 @@ static int dm_hash_insert(const char *na
>  	return -EBUSY;
>  }
>  
> -static void __hash_remove(struct hash_cell *hc)
> +static struct dm_table *__hash_remove(struct hash_cell *hc)
>  {
>  	struct dm_table *table;
>  	int srcu_idx;
> @@ -267,10 +267,13 @@ static void __hash_remove(struct hash_ce
>  	dm_table_event(table);
>  	dm_put_live_table(hc->md, srcu_idx);
>  
> +	table = NULL;
>  	if (hc->new_map)
> -		dm_table_destroy(hc->new_map);
> +		table = hc->new_map;
>  	dm_put(hc->md);
>  	free_cell(hc);
> +
> +	return table;
>  }
>  
>  static void dm_hash_remove_all(int keep_open_devices)
> @@ -278,6 +281,7 @@ static void dm_hash_remove_all(int keep_
>  	int i, dev_skipped;
>  	struct hash_cell *hc;
>  	struct mapped_device *md;
> +	struct dm_table *t;
>  
>  retry:
>  	dev_skipped = 0;
> @@ -295,10 +299,14 @@ retry:
>  			continue;
>  		}
>  
> -		__hash_remove(hc);
> +		t = __hash_remove(hc);
>  
>  		up_write(&_hash_lock);
>  
> +		if (t) {
> +			dm_sync_table(md);
> +			dm_table_destroy(t);
> +		}
>  		dm_put(md);
>  		if (likely(keep_open_devices))
>  			dm_destroy(md);
> @@ -808,6 +816,7 @@ static int dev_remove(struct dm_ioctl *p
>  	struct hash_cell *hc;
>  	struct mapped_device *md;
>  	int r;
> +	struct dm_table *t;
>  
>  	down_write(&_hash_lock);
>  	hc = __find_device_hash_cell(param);
> @@ -831,9 +840,14 @@ static int dev_remove(struct dm_ioctl *p
>  		return r;
>  	}
>  
> -	__hash_remove(hc);
> +	t = __hash_remove(hc);
>  	up_write(&_hash_lock);
>  
> +	if (t) {
> +		dm_sync_table(md);
> +		dm_table_destroy(t);
> +	}
> +
>  	if (!dm_kobject_uevent(md, KOBJ_REMOVE, param->event_nr))
>  		param->flags |= DM_UEVENT_GENERATED_FLAG;
>  
> @@ -997,6 +1011,7 @@ static int do_resume(struct dm_ioctl *pa
>  
>  		old_map = dm_swap_table(md, new_map);
>  		if (IS_ERR(old_map)) {
> +			dm_sync_table(md);
>  			dm_table_destroy(new_map);
>  			dm_put(md);
>  			return PTR_ERR(old_map);
> @@ -1014,6 +1029,10 @@ static int do_resume(struct dm_ioctl *pa
>  			param->flags |= DM_UEVENT_GENERATED_FLAG;
>  	}
>  
> +	/*
> +	 * Since dm_swap_table synchronizes RCU, nobody should be in
> +	 * read-side critical section already.
> +	 */
>  	if (old_map)
>  		dm_table_destroy(old_map);
>  
> @@ -1225,7 +1244,7 @@ static int table_load(struct dm_ioctl *p
>  {
>  	int r;
>  	struct hash_cell *hc;
> -	struct dm_table *t;
> +	struct dm_table *t, *old_map = NULL;
>  	struct mapped_device *md;
>  	struct target_type *immutable_target_type;
>  
> @@ -1281,14 +1300,14 @@ static int table_load(struct dm_ioctl *p
>  	hc = dm_get_mdptr(md);
>  	if (!hc || hc->md != md) {
>  		DMWARN("device has been removed from the dev hash table.");
> -		dm_table_destroy(t);
>  		up_write(&_hash_lock);
> +		dm_table_destroy(t);
>  		r = -ENXIO;
>  		goto out;
>  	}
>  
>  	if (hc->new_map)
> -		dm_table_destroy(hc->new_map);
> +		old_map = hc->new_map;
>  	hc->new_map = t;
>  	up_write(&_hash_lock);
>  
> @@ -1296,6 +1315,11 @@ static int table_load(struct dm_ioctl *p
>  	__dev_status(md, param);
>  
>  out:
> +	if (old_map) {
> +		dm_sync_table(md);
> +		dm_table_destroy(old_map);
> +	}
> +
>  	dm_put(md);
>  
>  	return r;
> @@ -1305,6 +1329,7 @@ static int table_clear(struct dm_ioctl *
>  {
>  	struct hash_cell *hc;
>  	struct mapped_device *md;
> +	struct dm_table *old_map = NULL;
>  
>  	down_write(&_hash_lock);
>  
> @@ -1316,7 +1341,7 @@ static int table_clear(struct dm_ioctl *
>  	}
>  
>  	if (hc->new_map) {
> -		dm_table_destroy(hc->new_map);
> +		old_map = hc->new_map;
>  		hc->new_map = NULL;
>  	}
>  
> @@ -1325,6 +1350,10 @@ static int table_clear(struct dm_ioctl *
>  	__dev_status(hc->md, param);
>  	md = hc->md;
>  	up_write(&_hash_lock);
> +	if (old_map) {
> +		dm_sync_table(md);
> +		dm_table_destroy(old_map);
> +	}
>  	dm_put(md);
>  
>  	return 0;
> Index: linux-3.3/include/linux/device-mapper.h
> ===================================================================
> --- linux-3.3.orig/include/linux/device-mapper.h	2012-05-10 10:03:06.000000000 +0900
> +++ linux-3.3/include/linux/device-mapper.h	2012-05-10 12:45:48.510196182 +0900
> @@ -364,6 +364,7 @@ int dm_table_complete(struct dm_table *t
>   */
>  struct dm_table *dm_get_live_table(struct mapped_device *md, int *srcu_idx);
>  void dm_put_live_table(struct mapped_device *md, int srcu_idx);
> +void dm_sync_table(struct mapped_device *md);
>  
>  /*
>   * Queries

--
dm-devel mailing list
dm-devel@xxxxxxxxxx
https://www.redhat.com/mailman/listinfo/dm-devel
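A note for readers following the locking discussion above: every review comment comes down to the same publish/retire rule. Swap the published table pointer, wait until no reader can still hold the old pointer, and only then destroy it; that wait is what dm_sync_table() provides. The following is a minimal, self-contained sketch of this SRCU pattern outside of device-mapper. Every demo_* name is a hypothetical stand-in (demo_map for md->map, demo_srcu for md->io_barrier, demo_swap_table() for __bind()/dm_swap_table(), kfree() for dm_table_destroy()); it illustrates the technique only and is not device-mapper code.

/*
 * Standalone sketch of the SRCU publish/retire pattern behind dm_sync_table().
 * All demo_* names are hypothetical; this is an illustration, not dm code.
 */
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/srcu.h>

struct demo_table {
	int nr_targets;
};

static struct demo_table __rcu *demo_map;	/* plays the role of md->map */
static struct srcu_struct demo_srcu;		/* plays the role of md->io_barrier */

/* Reader side: pin the current table for the duration of one operation. */
static void demo_reader(void)
{
	int idx = srcu_read_lock(&demo_srcu);
	struct demo_table *t = srcu_dereference(demo_map, &demo_srcu);

	if (t)
		pr_info("demo: live table has %d target(s)\n", t->nr_targets);

	srcu_read_unlock(&demo_srcu, idx);
}

/*
 * Writer side: publish the new table first, wait until no reader can still
 * hold the old pointer, and only then free it.  Freeing before the wait is
 * exactly the use-after-free the review comments point out.
 */
static void demo_swap_table(struct demo_table *new_table)
{
	struct demo_table *old;

	/* writers are assumed to be serialized by the caller */
	old = rcu_dereference_protected(demo_map, 1);
	rcu_assign_pointer(demo_map, new_table);

	/* the equivalent of dm_sync_table() */
	synchronize_srcu(&demo_srcu);
	synchronize_rcu_expedited();

	kfree(old);			/* stands in for dm_table_destroy() */
}

static int __init demo_init(void)
{
	struct demo_table *t;
	int r = init_srcu_struct(&demo_srcu);

	if (r)
		return r;

	t = kzalloc(sizeof(*t), GFP_KERNEL);
	if (!t) {
		cleanup_srcu_struct(&demo_srcu);
		return -ENOMEM;
	}
	t->nr_targets = 1;

	demo_swap_table(t);	/* the old pointer is NULL here; kfree(NULL) is a no-op */
	demo_reader();
	return 0;
}

static void __exit demo_exit(void)
{
	demo_swap_table(NULL);	/* retire the last table */
	cleanup_srcu_struct(&demo_srcu);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

The second grace period in the sketch mirrors dm_sync_table() itself: the synchronize_rcu_expedited() call is presumably there for the dm_get_live_table{_fast} path mentioned in the new struct comment, which would pin the map with plain rcu_read_lock() rather than SRCU, so both kinds of readers have to be drained before an old table can be freed.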