On 2023/5/8 9:40, Coly Li wrote: > > >> 2023年5月8日 09:30,Li Xiao Keng <lixiaokeng@xxxxxxxxxx> 写道: >> >> ping >> >> On 2023/4/23 9:30, Li Xiao Keng wrote: >>> ping >>> >>> On 2023/4/17 22:01, Li Xiao Keng wrote: >>>> When we add a new disk to a raid, it may return -EBUSY. >>>> >>>> The main process of --add: >>>> 1. dev_open >>>> 2. store_super1(st, di->fd) in write_init_super1 >>>> 3. fsync(di->fd) in write_init_super1 >>>> 4. close(di->fd) >>>> 5. ioctl(ADD_NEW_DISK) >>>> >>>> However, there will be some udev(change) event after step4. Then >>>> "/usr/sbin/mdadm --incremental ..." will be run, and the new disk >>>> will be add to md device. After that, ioctl will return -EBUSY. >>>> > > Hi Xiao Keng, > > The above description of the race is not informative enough, I am aware exactly how the race is from, therefore I am not able to response for the fix. > > Coly Li > ------ There is a raid1 with sda and sdb. And we add sdc to this raid, it may return -EBUSY. The main process of --add: 1. dev_open(sdc) in Manage_add 2. store_super1(st, di->fd) in write_init_super1 3. fsync(fd) in store_super1 4. close(di->fd) in write_init_super1 5. ioctl(ADD_NEW_DISK) Step 2 and 3 will add sdc to metadata of raid1. There will be udev(change of sdc) event after step4. Then "/usr/sbin/mdadm --incremental --export $devnode --offroot $env{DEVLINKS}" will be run, and the sdc will be added to the raid1. Then step 5 will return -EBUSY because it checks if device isn't claimed in md_import_device()->lock_rdev()->blkdev_get_by_dev() ->blkdev_get(). It will be confusing for users because sdc is added first time. The "incremental" will get map_lock before add sdc to raid1. So we add map_lock before write_init_super in "mdadm --add" to fix the race of "add" and "incremental". ------ I'm sorry for no detailed description. Do you have any advice of the new description? > > > >>>> Here we add map_lock before write_init_super in "mdadm --add" >>>> to fix this race. >>>> >>>> Signed-off-by: Li Xiao Keng <lixiaokeng@xxxxxxxxxx> >>>> Signed-off-by: Guanqin Miao <miaoguanqin@xxxxxxxxxx> >>>> --- >>>> Assemble.c | 5 ++++- >>>> Manage.c | 25 +++++++++++++++++-------- >>>> 2 files changed, 21 insertions(+), 9 deletions(-) >>>> >>>> diff --git a/Assemble.c b/Assemble.c >>>> index 49804941..086890ed 100644 >>>> --- a/Assemble.c >>>> +++ b/Assemble.c >>>> @@ -1479,8 +1479,11 @@ try_again: >>>> * to our list. We flag them so that we don't try to re-add, >>>> * but can remove if they turn out to not be wanted. >>>> */ >>>> - if (map_lock(&map)) >>>> + if (map_lock(&map)) { >>>> pr_err("failed to get exclusive lock on mapfile - continue anyway...\n"); >>>> + return 1; >>>> + } >>>> + >>>> if (c->update == UOPT_UUID) >>>> mp = NULL; >>>> else >>>> diff --git a/Manage.c b/Manage.c >>>> index f54de7c6..6a101bae 100644 >>>> --- a/Manage.c >>>> +++ b/Manage.c >>>> @@ -703,6 +703,7 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv, >>>> struct supertype *dev_st; >>>> int j; >>>> mdu_disk_info_t disc; >>>> + struct map_ent *map = NULL; >>>> >>>> if (!get_dev_size(tfd, dv->devname, &ldsize)) { >>>> if (dv->disposition == 'M') >>>> @@ -900,6 +901,10 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv, >>>> disc.raid_disk = 0; >>>> } >>>> >>>> + if (map_lock(&map)) { >>>> + pr_err("failed to get exclusive lock on mapfile when add disk\n"); >>>> + return -1; >>>> + } >>>> if (array->not_persistent==0) { >>>> int dfd; >>>> if (dv->disposition == 'j') >>>> @@ -911,9 +916,9 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv, >>>> dfd = dev_open(dv->devname, O_RDWR | O_EXCL|O_DIRECT); >>>> if (tst->ss->add_to_super(tst, &disc, dfd, >>>> dv->devname, INVALID_SECTORS)) >>>> - return -1; >>>> + goto unlock; >>>> if (tst->ss->write_init_super(tst)) >>>> - return -1; >>>> + goto unlock; >>>> } else if (dv->disposition == 'A') { >>>> /* this had better be raid1. >>>> * As we are "--re-add"ing we must find a spare slot >>>> @@ -971,14 +976,14 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv, >>>> pr_err("add failed for %s: could not get exclusive access to container\n", >>>> dv->devname); >>>> tst->ss->free_super(tst); >>>> - return -1; >>>> + goto unlock; >>>> } >>>> >>>> /* Check if metadata handler is able to accept the drive */ >>>> if (!tst->ss->validate_geometry(tst, LEVEL_CONTAINER, 0, 1, NULL, >>>> 0, 0, dv->devname, NULL, 0, 1)) { >>>> close(container_fd); >>>> - return -1; >>>> + goto unlock; >>>> } >>>> >>>> Kill(dv->devname, NULL, 0, -1, 0); >>>> @@ -987,7 +992,7 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv, >>>> dv->devname, INVALID_SECTORS)) { >>>> close(dfd); >>>> close(container_fd); >>>> - return -1; >>>> + goto unlock; >>>> } >>>> if (!mdmon_running(tst->container_devnm)) >>>> tst->ss->sync_metadata(tst); >>>> @@ -998,7 +1003,7 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv, >>>> dv->devname); >>>> close(container_fd); >>>> tst->ss->free_super(tst); >>>> - return -1; >>>> + goto unlock; >>>> } >>>> sra->array.level = LEVEL_CONTAINER; >>>> /* Need to set data_offset and component_size */ >>>> @@ -1013,7 +1018,7 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv, >>>> pr_err("add new device to external metadata failed for %s\n", dv->devname); >>>> close(container_fd); >>>> sysfs_free(sra); >>>> - return -1; >>>> + goto unlock; >>>> } >>>> ping_monitor(devnm); >>>> sysfs_free(sra); >>>> @@ -1027,7 +1032,7 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv, >>>> else >>>> pr_err("add new device failed for %s as %d: %s\n", >>>> dv->devname, j, strerror(errno)); >>>> - return -1; >>>> + goto unlock; >>>> } >>>> if (dv->disposition == 'j') { >>>> pr_err("Journal added successfully, making %s read-write\n", devname); >>>> @@ -1038,7 +1043,11 @@ int Manage_add(int fd, int tfd, struct mddev_dev *dv, >>>> } >>>> if (verbose >= 0) >>>> pr_err("added %s\n", dv->devname); >>>> + map_unlock(&map); >>>> return 1; >>>> +unlock: >>>> + map_unlock(&map); >>>> + return -1; >>>> } >>>> >>>> int Manage_remove(struct supertype *tst, int fd, struct mddev_dev *dv, >>>> > > . >