On Sunday March 3, gody@master.slon.net wrote: > On Sun, 3 Mar 2002, Neil Brown wrote: > > > > Could you give me a detailed description of your setup: > > What raid arrays do you have and how are they configured? > > (cat /proc/mdstat might be enough) > > >From freshly booted computer: > > cat /proc/mdstat > > Personalities : [raid1] [raid5] > read_ahead 1024 sectors > md0 : active raid1 hdi1[2] hdg1[1] hde1[0] hda1[3] > 56128 blocks [4/4] [UUUU] > > md1 : active raid5 hdi2[2] hdg2[1] hde2[0] > 6152704 blocks level 5, 32k chunk, algorithm 0 [3/3] [UUU] > > md2 : active raid5 hdi3[2] hdg3[1] hde3[0] > 1076224 blocks level 5, 8k chunk, algorithm 0 [3/3] [UUU] > > unused devices: <none> > Thanks. However I still cannot reproduce the problem, or figure out any way that it could be happening... Here is another patch that you could try. It incorporates the previous patch and adds a bit more locking particularly around the list of md devices. NeilBrown --- ./include/linux/raid/md_k.h 2002/03/07 03:30:54 1.1 +++ ./include/linux/raid/md_k.h 2002/03/08 05:16:48 @@ -75,13 +75,6 @@ extern dev_mapping_t mddev_map [MAX_MD_DEVS]; -static inline mddev_t * kdev_to_mddev (kdev_t dev) -{ - if (MAJOR(dev) != MD_MAJOR) - BUG(); - return mddev_map[MINOR(dev)].mddev; -} - /* * options passed in raidrun: */ @@ -206,6 +199,7 @@ unsigned long resync_mark_cnt;/* blocks written at resync_mark */ char *name; int recovery_running; + int dying; struct semaphore reconfig_sem; struct semaphore recovery_sem; struct semaphore resync_sem; @@ -309,7 +303,20 @@ tmp = tmp->next, tmp->prev != &all_mddevs \ ; ) -static inline int lock_mddev (mddev_t * mddev) +#define ITERATE_MDDEV_LOCK(mddev,tmp) \ + \ + for (tmp = all_mddevs.next; \ + mddev = md_list_entry(tmp, mddev_t, all_mddevs), \ + tmp = tmp->next, tmp->prev != &all_mddevs \ + && (down(&mddev->reconfig_sem), 1) \ + ; up(&mddev->reconfig_sem)) if (!mddev->dying) + +static inline void lock_mddev (mddev_t * mddev) +{ + return down(&mddev->reconfig_sem); +} + +static inline int lock_mddev_interruptible (mddev_t * mddev) { return down_interruptible(&mddev->reconfig_sem); } --- ./drivers/md/md.c 2002/03/07 03:30:54 1.1 +++ ./drivers/md/md.c 2002/03/08 04:34:59 @@ -130,8 +130,26 @@ /* * Enables to iterate over all existing md arrays + * + * Locking rules: + * - access to all_mddevs requires all_mddevs_sem. + * - an mddev can be locked while all_mddevs_sem is held. + * - When removing an mddev, we + * lock the mddev + * check that ->active is 1 (us). + * set "dying" + * unlock the mddev + * claim all_mddevs_sem + * actually remove device + * release all_mddevs_sem + * - to get a reference to an mddev, we + * claim all_mddevs_sem + * find the mddev in the list + * check that it isn't "dying" + * increase ->active or take a lock */ static MD_LIST_HEAD(all_mddevs); +static DECLARE_MUTEX(all_mddevs_sem); /* * The mapping between kdev and mddev is not necessary a simple @@ -140,6 +158,57 @@ */ dev_mapping_t mddev_map[MAX_MD_DEVS]; + +static inline mddev_t * kdev_to_mddev (kdev_t dev) +{ + mddev_t *mddev; + if (MAJOR(dev) != MD_MAJOR) + BUG(); + down(&all_mddevs_sem); + mddev = mddev_map[MINOR(dev)].mddev; + if (mddev && !mddev->dying) + atomic_inc(&mddev->active); + else + mddev = NULL; + up(&all_mddevs_sem); + return mddev; +} + +static inline mddev_t * kdev_to_mddev_lock (kdev_t dev) +{ + mddev_t *mddev; + if (MAJOR(dev) != MD_MAJOR) + BUG(); + down(&all_mddevs_sem); + mddev = mddev_map[MINOR(dev)].mddev; + if (mddev) { + if (mddev->dying) + mddev = NULL; + else + lock_mddev(mddev); + } + up(&all_mddevs_sem); + return mddev; +} +static inline mddev_t * kdev_to_mddev_lock_interruptible (kdev_t dev, int *err) +{ + mddev_t *mddev; + if (MAJOR(dev) != MD_MAJOR) + BUG(); + down(&all_mddevs_sem); + mddev = mddev_map[MINOR(dev)].mddev; + *err = 0; + if (mddev) { + if (mddev->dying) { + *err = -EBUSY; + mddev = NULL; + } else + *err = lock_mddev_interruptible(mddev); + } + up(&all_mddevs_sem); + return mddev; +} + void add_mddev_mapping(mddev_t * mddev, kdev_t dev, void *data) { unsigned int minor = MINOR(dev); @@ -175,13 +244,19 @@ static int md_make_request(request_queue_t *q, int rw, struct buffer_head * bh) { mddev_t *mddev = kdev_to_mddev(bh->b_rdev); + int rv; if (mddev && mddev->pers) - return mddev->pers->make_request(mddev, rw, bh); + rv = mddev->pers->make_request(mddev, rw, bh); else { buffer_IO_error(bh); - return 0; + rv = 0; } + if (mddev) + /* should really drop count when request completes... */ + if (atomic_dec_and_test(&mddev->active)) + BUG(); + return rv; } static mddev_t * alloc_mddev(kdev_t dev) @@ -199,20 +274,22 @@ memset(mddev, 0, sizeof(*mddev)); mddev->__minor = MINOR(dev); - init_MUTEX(&mddev->reconfig_sem); + init_MUTEX_LOCKED(&mddev->reconfig_sem); init_MUTEX(&mddev->recovery_sem); init_MUTEX(&mddev->resync_sem); MD_INIT_LIST_HEAD(&mddev->disks); MD_INIT_LIST_HEAD(&mddev->all_mddevs); - atomic_set(&mddev->active, 0); + atomic_set(&mddev->active, 1); /* * The 'base' mddev is the one with data NULL. * personalities can create additional mddevs * if necessary. */ + down(&all_mddevs_sem); add_mddev_mapping(mddev, dev, 0); md_list_add(&mddev->all_mddevs, &all_mddevs); + up(&all_mddevs_sem); MOD_INC_USE_COUNT; @@ -720,18 +797,11 @@ md_size[mdidx(mddev)] = 0; md_hd_struct[mdidx(mddev)].nr_sects = 0; - /* - * Make sure nobody else is using this mddev - * (careful, we rely on the global kernel lock here) - */ - while (md_atomic_read(&mddev->resync_sem.count) != 1) - schedule(); - while (md_atomic_read(&mddev->recovery_sem.count) != 1) - schedule(); + down(&all_mddevs_sem); del_mddev_mapping(mddev, MKDEV(MD_MAJOR, mdidx(mddev))); md_list_del(&mddev->all_mddevs); - MD_INIT_LIST_HEAD(&mddev->all_mddevs); + up(&all_mddevs_sem); kfree(mddev); MOD_DEC_USE_COUNT; } @@ -802,7 +872,8 @@ printk("md: **********************************\n"); printk("md: * <COMPLETE RAID STATE PRINTOUT> *\n"); printk("md: **********************************\n"); - ITERATE_MDDEV(mddev,tmp) { + down(&all_mddevs_sem); + ITERATE_MDDEV_LOCK(mddev,tmp) { printk("md%d: ", mdidx(mddev)); ITERATE_RDEV(mddev,rdev,tmp2) @@ -817,6 +888,7 @@ ITERATE_RDEV(mddev,rdev,tmp2) print_rdev(rdev); } + up(&all_mddevs_sem); printk("md: **********************************\n"); printk("\n"); } @@ -899,10 +971,6 @@ MD_BUG(); return 1; } - if (rdev->faulty) { - MD_BUG(); - return 1; - } if (rdev->sb->md_magic != MD_SB_MAGIC) { MD_BUG(); return 1; @@ -1000,6 +1068,11 @@ struct md_list_head *tmp; mdk_rdev_t *rdev; + if (!mddev->dying && !down_trylock(&mddev->reconfig_sem)) { + up(&mddev->reconfig_sem); + BUG(); + } + if (!mddev->sb_dirty) { printk("hm, md_update_sb() called without ->sb_dirty == 1, from %p.\n", __builtin_return_address(0)); return 0; @@ -1034,8 +1107,11 @@ err = 0; ITERATE_RDEV(mddev,rdev,tmp) { printk(KERN_INFO "md: "); - if (rdev->faulty) + if (rdev->faulty) { printk("(skipping faulty "); + if (rdev->sb) + free_disk_sb(rdev); + } if (rdev->alias_device) printk("(skipping alias "); @@ -1812,6 +1888,10 @@ if (mddev->recovery_running) md_interrupt_thread(md_recovery_thread); + if (!ro) + mddev->dying = 1; /* make sure nobody tries to use this */ + unlock_mddev(mddev); + /* * This synchronizes with signal delivery to the * resync or reconstruction thread. It also nicely @@ -1833,6 +1913,7 @@ if (mddev->pers->stop(mddev)) { if (mddev->ro) set_device_ro(dev, 1); + mddev->dying = 0; OUT(-EBUSY); } if (mddev->ro) @@ -1850,8 +1931,10 @@ mddev->sb_dirty = 1; md_update_sb(mddev); } - if (ro) + if (ro) { set_device_ro(dev, 1); + lock_mddev(mddev); + } } /* @@ -1965,6 +2048,7 @@ mdidx(mddev), partition_name(rdev0->dev)); ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) export_rdev(rdev); + atomic_dec(&mddev->active); continue; } mddev = alloc_mddev(md_kdev); @@ -1972,8 +2056,6 @@ printk(KERN_ERR "md: cannot allocate memory for md drive.\n"); break; } - if (md_kdev == countdev) - atomic_inc(&mddev->active); printk(KERN_INFO "md: created md%d\n", mdidx(mddev)); ITERATE_RDEV_GENERIC(candidates,pending,rdev,tmp) { bind_rdev_to_array(rdev, mddev); @@ -1981,6 +2063,9 @@ MD_INIT_LIST_HEAD(&rdev->pending); } autorun_array(mddev); + unlock_mddev(mddev); + if (md_kdev != countdev) + atomic_dec(&mddev->active); } printk(KERN_INFO "md: ... autorun DONE.\n"); } @@ -2640,7 +2725,9 @@ * Commands creating/starting a new array: */ - mddev = kdev_to_mddev(dev); + mddev = kdev_to_mddev_lock_interruptible(dev, &err); + if (mddev == NULL && err) + goto abort; switch (cmd) { @@ -2662,17 +2749,6 @@ err = -ENOMEM; goto abort; } - atomic_inc(&mddev->active); - - /* - * alloc_mddev() should possibly self-lock. - */ - err = lock_mddev(mddev); - if (err) { - printk(KERN_WARNING "md: ioctl, reason %d, cmd %d\n", - err, cmd); - goto abort; - } if (mddev->sb) { printk(KERN_WARNING "md: array md%d already has a superblock!\n", @@ -2695,9 +2771,6 @@ goto done_unlock; case START_ARRAY: - /* - * possibly make it lock the array ... - */ err = autostart_array((kdev_t)arg, dev); if (err) { printk(KERN_WARNING "md: autostart %s failed!\n", @@ -2717,7 +2790,7 @@ err = -ENODEV; goto abort; } - err = lock_mddev(mddev); + if (err) { printk(KERN_INFO "md: ioctl lock interrupted, reason %d, cmd %d\n",err, cmd); goto abort; @@ -2885,17 +2958,17 @@ /* * Always succeed, but increment the usage count */ - mddev_t *mddev = kdev_to_mddev(inode->i_rdev); - if (mddev) - atomic_inc(&mddev->active); + kdev_to_mddev(inode->i_rdev); return (0); } static int md_release(struct inode *inode, struct file * file) { mddev_t *mddev = kdev_to_mddev(inode->i_rdev); - if (mddev) + if (mddev) { + atomic_dec(&mddev->active); atomic_dec(&mddev->active); + } return 0; } @@ -3056,7 +3129,6 @@ return 0; if (!mddev->pers->error_handler || mddev->pers->error_handler(mddev,rdev) <= 0) { - free_disk_sb(rrdev); rrdev->faulty = 1; } else return 1; @@ -3180,7 +3252,8 @@ else sz += sprintf(page+sz, "%d sectors\n", read_ahead[MD_MAJOR]); - ITERATE_MDDEV(mddev,tmp) { + down(&all_mddevs_sem); + ITERATE_MDDEV_LOCK(mddev,tmp) { sz += sprintf(page + sz, "md%d : %sactive", mdidx(mddev), mddev->pers ? "" : "in"); if (mddev->pers) { @@ -3224,6 +3297,7 @@ } sz += sprintf(page + sz, "\n"); } + up(&all_mddevs_sem); sz += status_unused(page + sz); return sz; @@ -3355,6 +3429,7 @@ recheck: serialize = 0; + down(&all_mddevs_sem); ITERATE_MDDEV(mddev2,tmp) { if (mddev2 == mddev) continue; @@ -3366,6 +3441,7 @@ break; } } + up(&all_mddevs_sem); if (serialize) { interruptible_sleep_on(&resync_wait); if (md_signal_pending(current)) { @@ -3512,8 +3588,10 @@ struct md_list_head *tmp; printk(KERN_INFO "md: recovery thread got woken up ...\n"); -restart: - ITERATE_MDDEV(mddev,tmp) { + + restart: + down(&all_mddevs_sem); + ITERATE_MDDEV_LOCK(mddev,tmp) { sb = mddev->sb; if (!sb) continue; @@ -3540,9 +3618,13 @@ continue; if (mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_WRITE)) continue; + unlock_mddev(mddev); + up(&all_mddevs_sem); down(&mddev->recovery_sem); mddev->recovery_running = 1; err = md_do_sync(mddev, spare); + + lock_mddev(mddev); if (err == -EIO) { printk(KERN_INFO "md%d: spare disk %s failed, skipping to next spare.\n", mdidx(mddev), partition_name(MKDEV(spare->major,spare->minor))); @@ -3568,26 +3650,28 @@ DISKOP_SPARE_INACTIVE); up(&mddev->recovery_sem); mddev->recovery_running = 0; - continue; } else { mddev->recovery_running = 0; up(&mddev->recovery_sem); + + if (!disk_faulty(spare)) { + /* + * the SPARE_ACTIVE diskop possibly changes the + * pointer too + */ + mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_ACTIVE); + mark_disk_sync(spare); + mark_disk_active(spare); + sb->active_disks++; + sb->spare_disks--; + } + mddev->sb_dirty = 1; + md_update_sb(mddev); } - if (!disk_faulty(spare)) { - /* - * the SPARE_ACTIVE diskop possibly changes the - * pointer too - */ - mddev->pers->diskop(mddev, &spare, DISKOP_SPARE_ACTIVE); - mark_disk_sync(spare); - mark_disk_active(spare); - sb->active_disks++; - sb->spare_disks--; - } - mddev->sb_dirty = 1; - md_update_sb(mddev); + unlock_mddev(mddev); goto restart; } + up(&all_mddevs_sem); printk(KERN_INFO "md: recovery thread finished ...\n"); } @@ -3603,7 +3687,7 @@ printk(KERN_INFO "md: stopping all md devices.\n"); - ITERATE_MDDEV(mddev,tmp) + ITERATE_MDDEV_LOCK(mddev,tmp) do_md_stop (mddev, 1); /* * certain more exotic SCSI devices are known to be @@ -3944,6 +4028,9 @@ mddev->sb_dirty = 0; do_md_stop(mddev, 0); printk(KERN_WARNING "md: starting md%d failed\n", minor); + } else { + unlock_mddev(mddev); + atomic_dec(&mddev->active); } } } --- ./drivers/md/raid5.c 2002/03/07 03:30:54 1.1 +++ ./drivers/md/raid5.c 2002/03/08 05:19:36 @@ -1293,8 +1293,12 @@ handled = 0; - if (mddev->sb_dirty) - md_update_sb(mddev); + if (mddev->sb_dirty) { + lock_mddev(mddev); + if (mddev->sb_dirty) + md_update_sb(mddev); + unlock_mddev(mddev); + } md_spin_lock_irq(&conf->device_lock); while (1) { struct list_head *first; --- ./drivers/md/raid1.c 2002/03/07 03:30:55 1.1 +++ ./drivers/md/raid1.c 2002/03/07 03:30:55 @@ -1136,8 +1136,12 @@ md_spin_unlock_irqrestore(&retry_list_lock, flags); mddev = r1_bh->mddev; - if (mddev->sb_dirty) - md_update_sb(mddev); + if (mddev->sb_dirty) { + lock_mddev(mddev); + if (mddev->sb_dirty) + md_update_sb(mddev); + unlock_mddev(mddev); + } bh = &r1_bh->bh_req; switch(r1_bh->cmd) { case SPECIAL: --- ./drivers/md/multipath.c 2002/03/07 03:30:55 1.1 +++ ./drivers/md/multipath.c 2002/03/07 03:30:55 @@ -701,8 +701,12 @@ md_spin_unlock_irqrestore(&retry_list_lock, flags); mddev = mp_bh->mddev; - if (mddev->sb_dirty) - md_update_sb(mddev); + if (mddev->sb_dirty) { + lock_mddev(mddev); + if (mddev->sb_dirty) + md_update_sb(mddev); + unlock_mddev(mddev); + } bh = &mp_bh->bh_req; dev = bh->b_dev; - To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to majordomo@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html