Here goes the updated patch. commit 74da1595eb711b77969275070cda7516bac36f5e Signed-off-by: Sandeep K Sinha <sandeepksinha@xxxxxxxxx> Date: Sat Jun 6 20:49:37 2009 +0530 Due to the lack of memory ordering guarantees, we may have races around mddev->conf. This patch addresses the same using rcu protection to avoid such race conditions. diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 9ad6ec4..a56095c 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -28,9 +28,11 @@ static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector) { int lo, mid, hi; - linear_conf_t *conf = mddev_to_conf(mddev); + linear_conf_t *conf; + rcu_read_lock(); lo = 0; + conf = rcu_dereference(mddev->private); hi = mddev->raid_disks - 1; /* @@ -45,7 +47,7 @@ static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector) else lo = mid + 1; } - + rcu_read_unlock(); return conf->disks + lo; } @@ -86,36 +88,49 @@ static int linear_mergeable_bvec(struct request_queue *q, static void linear_unplug(struct request_queue *q) { mddev_t *mddev = q->queuedata; - linear_conf_t *conf = mddev_to_conf(mddev); + linear_conf_t *conf; int i; + rcu_read_lock(); + conf = rcu_dereference(mddev->private); + for (i=0; i < mddev->raid_disks; i++) { struct request_queue *r_queue = bdev_get_queue(conf->disks[i].rdev->bdev); blk_unplug(r_queue); } + rcu_read_unlock(); } static int linear_congested(void *data, int bits) { mddev_t *mddev = data; - linear_conf_t *conf = mddev_to_conf(mddev); + linear_conf_t *conf; int i, ret = 0; + rcu_read_lock(); + conf = rcu_dereference(mddev->private); + for (i = 0; i < mddev->raid_disks && !ret ; i++) { struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev); ret |= bdi_congested(&q->backing_dev_info, bits); } + + rcu_read_unlock(); return ret; } On Wed, Jun 17, 2009 at 12:05 PM, SandeepKsinha<sandeepksinha@xxxxxxxxx> wrote: > Neil, > > On Mon, Jun 15, 2009 at 4:26 AM, Neil Brown<neilb@xxxxxxx> wrote: >> >> Hi, >> Thanks for this patch, and sorry for the delay in reviewing it. >> >> I have a few issues: >> >> On Saturday June 6, sandeepksinha@xxxxxxxxx wrote: >>> Signed-off-by: Sandeep K Sinha <sandeepksinha@xxxxxxxxx> >>> >>> Protecting mddev with barriers to avoid races. >> >> 1/ You need a lot more of an explanatory comment than this. >> At least give some hint as to what the races are. >> Give than the rcu primitives are used, it now makes sense to use >> e.g. call_rcu to free the old 'conf'. That might reasonably be in a >> separate patch, but the comment on this patch should at least at that >> possibility. >>> > > Sure. I shall do it for the final patch. I will also take care of this > henceforth. > >>> diff --git a/drivers/md/linear.c b/drivers/md/linear.c >>> index 9ad6ec4..a56095c 100644 >>> --- a/drivers/md/linear.c >>> +++ b/drivers/md/linear.c >>> @@ -28,9 +28,11 @@ >>> static inline dev_info_t *which_dev(mddev_t *mddev, sector_t sector) >>> { >>> int lo, mid, hi; >>> - linear_conf_t *conf = mddev_to_conf(mddev); >>> + linear_conf_t *conf; >>> >>> + rcu_read_lock(); >>> lo = 0; >>> + conf = rcu_dereference(mddev->private); >>> hi = mddev->raid_disks - 1; >>> >> >> 2/ mddev->raid_disks should really be dereferenced before 'conf'. >> Doing it the way you have done it, the 'raid_disks' value could be >> larger than the value supported by the 'conf' so things could >> go wrong. >> > Agreed. I hope you are referring to the case where a disk is in the > process of being added to an array. Is that right ? > Kindly confirm. >> >>> /* >>> @@ -45,7 +47,7 @@ static inline dev_info_t *which_dev(mddev_t *mddev, >>> sector_t sector) >>> else >>> lo = mid + 1; >>> } >>> - >>> + rcu_read_unlock(); >>> return conf->disks + lo; >>> } >> >> 3/ We are accessing conf->disks well after the rcu_lock has been released. >> That is not exactly a problem with the code as it stands. But if >> we do go ahead and free the old 'conf' with call_rcu, then this >> becomes racy. >> We should hold the rcu_read_lock for the entire time that we are >> accessing the contents of 'conf'. >> > True. > >> That means we don't take rcu_read_lock in which_dev, but rather >> take it in the two functions that call which_dev. >> > > I have made that change. >>> >>> @@ -86,36 +88,49 @@ static int linear_mergeable_bvec(struct request_queue *q, >>> static void linear_unplug(struct request_queue *q) >>> { >>> mddev_t *mddev = q->queuedata; >>> - linear_conf_t *conf = mddev_to_conf(mddev); >>> + linear_conf_t *conf; >>> int i; >>> >>> + rcu_read_lock(); >>> + conf = rcu_dereference(mddev->private); >>> + >>> for (i=0; i < mddev->raid_disks; i++) { >>> struct request_queue *r_queue = bdev_get_queue(conf->disks[i].rdev->bdev); >>> blk_unplug(r_queue); >>> } >>> + rcu_read_unlock(); >>> } >>> >>> static int linear_congested(void *data, int bits) >>> { >>> mddev_t *mddev = data; >>> - linear_conf_t *conf = mddev_to_conf(mddev); >>> + linear_conf_t *conf; >>> int i, ret = 0; >>> >>> + rcu_read_lock(); >>> + conf = rcu_dereference(mddev->private); >>> + >>> for (i = 0; i < mddev->raid_disks && !ret ; i++) { >>> struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev); >>> ret |= bdi_congested(&q->backing_dev_info, bits); >>> } >>> + >>> + rcu_read_unlock(); >>> return ret; >>> } >>> >>> static sector_t linear_size(mddev_t *mddev, sector_t sectors, int raid_disks) >>> { >>> - linear_conf_t *conf = mddev_to_conf(mddev); >>> - >>> + linear_conf_t *conf; >>> + sector_t array_sectors; >>> + rcu_read_lock(); >>> + conf = rcu_dereference(mddev->private); >>> WARN_ONCE(sectors || raid_disks, >>> "%s does not support generic reshape\n", __func__); >>> - >>> - return conf->array_sectors; >>> + array_sectors = conf->array_sectors; >>> + rcu_read_unlock(); >>> + >>> + return array_sectors; >>> } >>> >>> static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) >>> @@ -215,15 +230,14 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev) >>> return -EINVAL; >>> >>> rdev->raid_disk = rdev->saved_raid_disk; >>> - >>> - newconf = linear_conf(mddev,mddev->raid_disks+1); >>> + newconf = linear_conf(mddev,mddev->raid_disks + 1); >>> >>> if (!newconf) >>> return -ENOMEM; >>> >>> newconf->prev = mddev_to_conf(mddev); >>> - mddev->private = newconf; >>> mddev->raid_disks++; >>> + rcu_assign_pointer(mddev->private,newconf); >>> md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); >>> set_capacity(mddev->gendisk, mddev->array_sectors); >>> return 0; >>> @@ -231,14 +245,17 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev) >>> >>> static int linear_stop (mddev_t *mddev) >>> { >>> - linear_conf_t *conf = mddev_to_conf(mddev); >>> - >>> + linear_conf_t *conf; >>> + >>> + rcu_read_lock(); >>> + conf = rcu_dereference(mddev->private); >>> blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ >>> do { >>> linear_conf_t *t = conf->prev; >>> kfree(conf); >>> conf = t; >>> } while (conf); >>> + rcu_read_unlock(); >>> >> >> 4/ We don't need the rcu protection here as we hold ->reconfig_mutex >> both in linear_add and linear_stop, so they cannot race. >> Adding a comment to this effect might be a good idea though. >> > > Fine. Shall do this as well. > > The new patch will follow soon. > >> Thanks, >> >> NeilBrown >> >> >>> return 0; >>> } >>> -- >>> Regards, >>> Sandeep. >>> >>> >>> >>> >>> >>> >>> “To learn is to change. Education is a process that changes the learner.” >> > > > > -- > Regards, > Sandeep. > > > > > > > “To learn is to change. Education is a process that changes the learner.” > -- Regards, Sandeep. “To learn is to change. Education is a process that changes the learner.” -- To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html