Add support for journal disk hot add/remove. Mostly trival checks in md part. The raid5 part is a little tricky. For hot-remove, we can't wait pending write as it's called from raid5d. The wait will cause deadlock. We simplily fail the hot-remove. A hot-remove retry can success eventually since if journal disk is faulty all pending write will be failed and finish. For hot-add, since an array supporting journal but without journal disk will be marked read-only, we are safe to hot add journal without stopping IO (should be read IO, while journal only handles write IO). Signed-off-by: Shaohua Li <shli@xxxxxx> --- drivers/md/md.c | 44 +++++++++++++++++++++++++++++++++----------- drivers/md/raid5.c | 31 +++++++++++++++++++++++-------- 2 files changed, 56 insertions(+), 19 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 807095f..de9e3a1 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2053,7 +2053,9 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) /* make sure rdev->sectors exceeds mddev->dev_sectors */ if (rdev->sectors && (mddev->dev_sectors == 0 || - rdev->sectors < mddev->dev_sectors)) { + rdev->sectors < mddev->dev_sectors) && + !(test_bit(MD_HAS_JOURNAL, &mddev->flags) && + test_bit(Journal, &rdev->flags))) { if (mddev->pers) { /* Cannot change size, so fail * If mddev->level <= 0, then we don't care @@ -2084,7 +2086,9 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) } } rcu_read_unlock(); - if (mddev->max_disks && rdev->desc_nr >= mddev->max_disks) { + if (mddev->max_disks && rdev->desc_nr >= mddev->max_disks && + !(test_bit(MD_HAS_JOURNAL, &mddev->flags) && + test_bit(Journal, &rdev->flags))) { printk(KERN_WARNING "md: %s: array is limited to %d devices\n", mdname(mddev), mddev->max_disks); return -EBUSY; @@ -6031,8 +6035,25 @@ static int add_new_disk(struct mddev *mddev, mdu_disk_info_t *info) else clear_bit(WriteMostly, &rdev->flags); - if (info->state & (1<<MD_DISK_JOURNAL)) + if (info->state & (1<<MD_DISK_JOURNAL)) { + struct md_rdev *rdev2; + bool has_journal = false; + + /* make sure no existing journal disk */ + rcu_read_lock(); + rdev_for_each(rdev2, mddev) { + if (test_bit(Journal, &rdev2->flags)) { + has_journal = true; + break; + } + } + rcu_read_unlock(); + if (has_journal) { + export_rdev(rdev); + return -EBUSY; + } set_bit(Journal, &rdev->flags); + } /* * check whether the device shows up in other nodes */ @@ -8162,19 +8183,20 @@ static int remove_and_add_spares(struct mddev *mddev, continue; if (test_bit(Faulty, &rdev->flags)) continue; - if (test_bit(Journal, &rdev->flags)) - continue; - if (mddev->ro && - ! (rdev->saved_raid_disk >= 0 && - !test_bit(Bitmap_sync, &rdev->flags))) - continue; + if (!test_bit(Journal, &rdev->flags)) { + if (mddev->ro && + ! (rdev->saved_raid_disk >= 0 && + !test_bit(Bitmap_sync, &rdev->flags))) + continue; - rdev->recovery_offset = 0; + rdev->recovery_offset = 0; + } if (mddev->pers-> hot_add_disk(mddev, rdev) == 0) { if (sysfs_link_rdev(mddev, rdev)) /* failure here is OK */; - spares++; + if (!test_bit(Journal, &rdev->flags)) + spares++; md_new_event(mddev); set_bit(MD_CHANGE_DEVS, &mddev->flags); } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 704ef7f..f8f3d52 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -7141,14 +7141,16 @@ static int raid5_remove_disk(struct mddev *mddev, struct md_rdev *rdev) struct disk_info *p = conf->disks + number; print_raid5_conf(conf); - if (test_bit(Journal, &rdev->flags)) { + if (test_bit(Journal, &rdev->flags) && conf->log) { /* - * journal disk is not removable, but we need give a chance to - * update superblock of other disks. Otherwise journal disk - * will be considered as 'fresh' + * we can't wait pending write here, as this is called in + * raid5d, wait will deadlock. */ - set_bit(MD_CHANGE_DEVS, &mddev->flags); - return -EINVAL; + if (atomic_read(&mddev->writes_pending)) + return -EBUSY; + r5l_exit_log(conf->log); + conf->log = NULL; + return 0; } if (rdev == p->rdev) rdevp = &p->rdev; @@ -7212,8 +7214,21 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) int first = 0; int last = conf->raid_disks - 1; - if (test_bit(Journal, &rdev->flags)) - return -EINVAL; + if (test_bit(Journal, &rdev->flags)) { + char b[BDEVNAME_SIZE]; + if (conf->log) + return -EBUSY; + + rdev->raid_disk = mddev->raid_disks; + /* + * The array is in readonly mode if journal is missing, so no + * write requests running. We should be safe + */ + r5l_init_log(conf, rdev); + printk(KERN_INFO"md/raid:%s: using device %s as journal\n", + mdname(mddev), bdevname(rdev->bdev, b)); + return 0; + } if (mddev->recovery_disabled == conf->recovery_disabled) return -EBUSY; -- 2.4.6 -- To unsubscribe from this list: send the line "unsubscribe linux-raid" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html