[md PATCH] reshape checkpointing support for external IMSM metadata

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hello Neil,
I'm sending a patch proposal for reshape checkpointing in IMSM.
This patch is a continuation of the OLCE (online capacity expansion)
patches sent by Adam Kwolek some time ago, so please apply this one after those patches.

Support for IMSM Migration Record was added.
md reads the migration record when starting the reshape. The record is initialized by mdmon (with current familyNum from superblock).
md calculates reshape parameters in raid5_init_reshape_imsm().
The migration record is updated through: md_update_sb -> pers -> write_migr_record().
The reshape_position notifications are simplified; we no longer wait for metadata update notifications from user space, since
curr_migr_unit is now updated in the migration record directly by md.
We could probably remove the sysfs reshape_position notification altogether, since the IMSM OROM uses curr_migr_unit from the migration record.

This patch is the first part of IMSM checkpointing and is not yet the complete feature.
The migration copy area still has to be implemented for full compatibility with the IMSM OROM.

Regards,
Maciek.

---
 drivers/md/md.c    |   96 ++++++++------------------
 drivers/md/md.h    |    4 +
 drivers/md/raid5.c |  190 ++++++++++++++++++++++++++++++++++++++++++++++++++--
 drivers/md/raid5.h |   20 +++++
 4 files changed, 236 insertions(+), 74 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index f418ef6..99547e2 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1895,6 +1895,25 @@ static void print_rdev(mdk_rdev_t *rdev, int major_version)
                printk(KERN_INFO "md: no rdev superblock!\n");
 }

+/* Find the container named by a member's metadata_type ("/<name>/..."); the
+ * whole name must match, so that "md1" is not taken for "/md12/0". */
+mddev_t *md_find_parent_container(mddev_t *my_mddev)
+{
+       const char *ref = my_mddev->metadata_type + 1;
+       struct list_head *head;
+       mddev_t *mddev;
+
+       if (!my_mddev->external)
+               return NULL;
+       for_each_mddev(mddev, head) {
+               size_t len = mddev->gendisk ? strlen(mddev->gendisk->disk_name) : 0;
+               if (len && !strncmp(ref, mddev->gendisk->disk_name, len) &&
+                   (ref[len] == '/' || ref[len] == '\0'))
+                       return mddev;
+       }
+       return NULL;
+}
+
 static void md_print_devices(void)
 {
        struct list_head *tmp;
@@ -2050,8 +2069,10 @@ repeat:
                /* OLCE: reshape for external meta */
                if (mddev->external &&
                    test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) {
-                       /* clear single notification guard */
-                       clear_bit(MD_CHANGE_PENDING_EXT, &mddev->flags);
+
+                       /* update migration record */
+                       if (mddev->pers->write_migr_record)
+                               mddev->pers->write_migr_record(mddev);

                        /* for external meta clean MD_CHANGE_PENDING flag
                         * as changes will be made by user space code
@@ -3658,16 +3679,6 @@ sync_completed_store(mddev_t *mddev, const char *buf, size_t len)
        if (!mddev->external)
                return -EINVAL;

-       /* put notification for external meta on any value put in **/
-       if (test_and_clear_bit(MD_CHANGE_PENDING_EXT, &mddev->flags)) {
-               /* reentry bit clear */
-               clear_bit(MD_CHANGE_PENDING_EXT, &mddev->flags);
-
-               /* wake up */
-               clear_bit(MD_CHANGE_DEVS, &mddev->flags);
-               wake_up(&mddev->sb_wait);
-       }
-
        /* check if numeric */
        if (strict_strtoull(buf, 10, &passedValue))
                return -EINVAL;
@@ -6777,6 +6788,8 @@ static int remove_and_add_spares(mddev_t *mddev)
        }
        return spares;
 }
+
+
 /*
  * This routine is regularly called by all per-raid-array threads to
  * deal with generic issues like resync and super-block update.
@@ -6829,62 +6842,10 @@ void md_check_recovery(mddev_t *mddev)
                (mddev->safemode == 2 && ! atomic_read(&mddev->writes_pending)
                 && !mddev->in_sync && mddev->recovery_cp == MaxSector)
                )) {
-               /* OLCE */
-               if (mddev->external) {
-                       int send_Notification = 0;
-
-                       /* reentry check */
-                       if (!test_bit(MD_CHANGE_PENDING_EXT, &mddev->flags)) {
-                               /* triger monitor and wait for uptate meta */
-                               if ((mddev->reshape_position > 0) &&
-                                   (mddev->reshape_position != MaxSector)) {
-                                       /* signal user space if reshape is in progress only
-                                        * use  single notification guard
-                                        */
-                                       set_bit(MD_CHANGE_PENDING_EXT, &mddev->flags);
-                                       /* Notify reshape_position */
-                                       send_Notification = 1;
-                               }
-                       } else {
-                               /* for external meta, when we are blocked
-                                * on flag we have to check if
-                                * kthread_should_stop().  if so,
-                                * kick it internally
-                                * (user space part can be unavailable
-                                * to complete action, this allows thread
-                                * to wake up and terminate)
-                                */
-                               if (kthread_should_stop()) {
-                                       /* clean single notification guard */
-                                       clear_bit(MD_CHANGE_PENDING_EXT, &mddev->flags);
-                                       /* notification */
-                                       clear_bit(MD_CHANGE_DEVS, &mddev->flags);
-                                       wake_up(&mddev->sb_wait);
-                               } else {
-                                       /* check if user space (mdmon is alive */
-                                       /* nothing changed */
-                                       if (mddev->reshape_position == mddev->reshape_position_sent) {
-                                               if ((get_seconds() - mddev->reshape_position_sent_timestamp) > 15) {
-                                                       printk(KERN_INFO "OLCE: mdmon is dead ?\n");
-                                                       send_Notification = 1;
-                                               }
-                                       }
-                               }
-                       }
-
-                       if (send_Notification) {
-                               /* turn off hand shake with mdmon
-                                * - at this moment checkpointing is implemented in md
-                                */
-                               /*
-                               sysfs_notify(&mddev->kobj, NULL, "reshape_position");
-                               mddev->reshape_position_sent = mddev->reshape_position;
-                               mddev->reshape_position_sent_timestamp = get_seconds();
-                               */

+               /* Updating external metadata */
+               if ((mddev->external) && (mddev->flags))
                                md_update_sb(mddev, 0);
-                       }
-               }
                return;
        }

@@ -7230,6 +7191,9 @@ EXPORT_SYMBOL(md_register_thread);
 EXPORT_SYMBOL(md_unregister_thread);
 EXPORT_SYMBOL(md_wakeup_thread);
 EXPORT_SYMBOL(md_check_recovery);
+EXPORT_SYMBOL(md_super_write);
+EXPORT_SYMBOL(md_super_wait);
+EXPORT_SYMBOL(md_find_parent_container);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS("md");
 MODULE_ALIAS_BLOCKDEV_MAJOR(MD_MAJOR);
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 11885c8..bd54665 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -121,7 +121,6 @@ struct mddev_s

 #define MD_SYNC_COMPLETED_SHOW_0 3  /* force to show in sync complete 0 */
                                    /*  when it is set from user space  */
-#define MD_CHANGE_PENDING_EXT 4        /* superblock update in progress external */
        int                             suspended;
        atomic_t                        active_io;
        int                             ro;
@@ -357,6 +356,8 @@ struct mdk_personality
         * by mdmon
         */
        int (*wait_reshape)(mddev_t *mddev, unsigned int useTimeout);
+       /* writes Migration Record while reshaping */
+       void (*write_migr_record) (mddev_t *mddev);
 };


@@ -456,5 +457,6 @@ extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors);
 extern int md_check_no_bitmap(mddev_t *mddev);
 extern int md_integrity_register(mddev_t *mddev);
 void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev);
+extern mddev_t *md_find_parent_container(mddev_t *my_mddev);

 #endif /* _MD_MD_H */
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 656a852..d8baa11 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -3920,13 +3920,6 @@ static int make_request(struct request_queue *q, struct bio * bi)
                                        spin_unlock_irq(&conf->device_lock);
                                        schedule();
                                        goto retry;
-                               } else {
-                                       if ((mddev->delta_disks > 0) &&
-                                           (logical_sector < conf->reshape_safe)) {
-                                               spin_unlock_irq(&conf->device_lock);
-                                               schedule();
-                                               goto retry;
-                                       }
                                }
                        }
                        spin_unlock_irq(&conf->device_lock);
@@ -4020,6 +4013,79 @@ static int make_request(struct request_queue *q, struct bio * bi)
        return 0;
 }

+/*
+ * IMSM specific code.
+ * Writes the reshape checkpoint to the IMSM Migration Record in the last
+ * sector of member devices with raid_disk < MAX_CKPT_DISKS_IMSM. Does nothing
+ * when no IMSM parent container is found or the record is not initialized.
+ */
+#define RAID_DISK_RESERVED_BLOCKS_IMSM_HI 417
+/* General migration checkpointing only occurs to at most 2 disks */
+#define MAX_CKPT_DISKS_IMSM 2
+
+void raid5_update_migr_record_imsm(mddev_t *mddev)
+{
+       raid5_conf_t *conf = mddev->private;
+       struct imsm_migr_record *migr_rec;
+       struct page *ckpt_page;
+       mdk_rdev_t *rdev;
+       long long unsigned curr_migr_unit;
+       sector_t dev_sectors, migr_rec_sector;
+       sector_t min_dev_sectors = -1LLU;
+       mddev_t *parent;
+
+       parent = md_find_parent_container(mddev);
+       if (!parent)
+               return;
+
+       /* if meta is not IMSM, exit */
+       if (strcmp(parent->metadata_type, "imsm") != 0)
+               return;
+
+       /* Migration Record is not ready yet: bail out before allocating */
+       if (conf->migr_rec.blocksPerUnit == 0)
+               return;
+
+       /* zeroed page: a whole 512-byte sector is written, not just the record */
+       ckpt_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+       if (!ckpt_page) {
+               printk(KERN_ALERT "md: out of memory.\n");
+               return;
+       }
+
+       migr_rec = (struct imsm_migr_record *) page_address(ckpt_page);
+       memcpy(migr_rec, &conf->migr_rec, sizeof(*migr_rec));
+       migr_rec->recStatus = 0;   /* FIX ME: Implement IMSM copy Area!!! */
+       curr_migr_unit = conf->reshape_progress;
+       sector_div(curr_migr_unit, migr_rec->blocksPerUnit);
+
+       migr_rec->dest1stMemberLba = curr_migr_unit * migr_rec->destDepthPerUnit;
+       migr_rec->currMigrUnit = (u32)curr_migr_unit + 1;
+
+       /* First find the smallest dev */
+       list_for_each_entry(rdev, &mddev->disks, same_set) {
+               dev_sectors = rdev->bdev->bd_inode->i_size / 512;
+               if (dev_sectors < min_dev_sectors)
+                       min_dev_sectors = dev_sectors;
+       }
+
+       migr_rec->ckptAreaPba = min_dev_sectors - RAID_DISK_RESERVED_BLOCKS_IMSM_HI;
+
+       list_for_each_entry(rdev, &mddev->disks, same_set) {
+               if (rdev->raid_disk > (MAX_CKPT_DISKS_IMSM - 1) ||
+                   test_bit(Faulty, &rdev->flags))
+                       continue;
+               /* Checkpoint Record is stored at the very last sector */
+               dev_sectors = rdev->bdev->bd_inode->i_size / 512;
+               migr_rec_sector = dev_sectors - 1;
+               md_super_write(mddev, rdev, migr_rec_sector, 512, ckpt_page);
+               md_super_wait(mddev);
+       }
+
+       /* every write was waited for above, so the page can be released */
+       __free_page(ckpt_page);
+}
+
 static sector_t raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks);

 static sector_t reshape_request(mddev_t *mddev, sector_t sector_nr, int *skipped)
@@ -5044,6 +5110,7 @@ static int run(mddev_t *mddev)

        if (conf->reshape_progress != MaxSector) {
                printk("...ok start reshape thread\n");
+               conf->reshape_checkpoint = jiffies;
                conf->reshape_safe = conf->reshape_progress;
                atomic_set(&conf->reshape_stripes, 0);
                clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
@@ -5052,6 +5119,7 @@ static int run(mddev_t *mddev)
                set_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
                mddev->sync_thread = md_register_thread(md_do_sync, mddev,
                                                        "reshape");
+               mddev->resync_max = 0;
        }

        /* read-ahead size must cover two whole stripes, which is
@@ -5887,6 +5955,106 @@ static void *raid6_takeover(mddev_t *mddev)
        mddev->raid_disks += 1;
        return setup_conf(mddev);
 }
+/*
+ * IMSM specific code.
+ * Reads the Migration Record from the last sector of the first readable,
+ * non-faulty member device with raid_disk < MAX_CKPT_DISKS_IMSM.
+ * Returns 0 with *init_migr_rec pointing into a freshly allocated page that
+ * the caller must release with free_page(); otherwise *init_migr_rec is NULL
+ * and the result is -ENOMEM (no page) or -EIO (no device could be read).
+ */
+static int raid5_load_migr_record_imsm(mddev_t *mddev,
+                                                                          struct imsm_migr_record **init_migr_rec)
+{
+       struct page *ckpt_page;
+       mdk_rdev_t *rdev;
+       sector_t sector;
+
+       *init_migr_rec = NULL;
+       ckpt_page = alloc_page(GFP_KERNEL);
+       if (!ckpt_page) {
+               printk(KERN_ALERT "md: out of memory.\n");
+               return -ENOMEM;
+       }
+
+       list_for_each_entry(rdev, &mddev->disks, same_set) {
+               if (rdev->raid_disk > (MAX_CKPT_DISKS_IMSM - 1) ||
+                   test_bit(Faulty, &rdev->flags))
+                       continue;
+               sector = rdev->bdev->bd_inode->i_size / 512 - 1;
+               if (sync_page_io(rdev->bdev, sector, 512, ckpt_page, READ)) {
+                       *init_migr_rec = page_address(ckpt_page);
+                       return 0;
+               }
+       }
+       /* nothing could be read: do not leak the page */
+       __free_page(ckpt_page);
+       return -EIO;
+}
+
+/*
+ * IMSM specific code.
+ * Initializes conf->migr_rec for a reshape of an array with external IMSM
+ * metadata. familyNum is taken from the on-disk Migration Record (it should
+ * have been initialized by mdmon); the remaining fields are derived from the
+ * old and new array geometry. Returns 0 on success, -1 if the on-disk
+ * record could not be read.
+ */
+#define SECT_PER_MB_SHIFT 11
+
+static int raid5_init_reshape_imsm(mddev_t *mddev, raid5_conf_t *conf)
+{
+       struct imsm_migr_record *init_migr_rec;
+       int new_data_disks, prev_data_disks;
+       int prev_stripe_sectors, new_stripe_sectors;
+       long long unsigned new_array_sectors;
+       unsigned blocks_per_unit;
+
+       if (raid5_load_migr_record_imsm(mddev, &init_migr_rec) != 0) {
+               printk(KERN_ALERT "md: could not read IMSM migr record.\n");
+               return -1;
+       }
+
+       /* Copy family number from migr record on raid dev
+        * it was set by the IMSM user space when starting migration
+        */
+       conf->migr_rec.familyNum = init_migr_rec->familyNum;
+       /* the record sits in a page allocated by the loader; nothing else
+        * is needed from it, so release the page instead of leaking it */
+       free_page((unsigned long)init_migr_rec);
+
+       conf->migr_rec.recStatus = 0;
+       conf->migr_rec.currMigrUnit = conf->reshape_progress;
+       conf->migr_rec.ascendingMigr = (mddev->delta_disks > 0) ? 1 : 0;
+       prev_data_disks = conf->previous_raid_disks - conf->max_degraded;
+       new_data_disks = conf->raid_disks - conf->max_degraded;
+
+       /* post-migration capacity: whole chunks per device, times the new
+        * number of data disks, rounded down to a multiple of 2^11 sectors */
+       new_array_sectors = mddev->dev_sectors;
+       new_array_sectors &= ~(unsigned long long)(mddev->chunk_sectors-1);
+       new_array_sectors *= new_data_disks;
+       new_array_sectors = (new_array_sectors >> SECT_PER_MB_SHIFT)
+               << SECT_PER_MB_SHIFT;
+
+       conf->migr_rec.postMigrVolCapacity = new_array_sectors;
+       conf->migr_rec.postMigrVolCapacityHi = new_array_sectors >> 32;
+
+       /* a migration unit is the larger of the old and new full stripes */
+       prev_stripe_sectors =  conf->prev_chunk_sectors * prev_data_disks;
+       new_stripe_sectors = conf->chunk_sectors * new_data_disks;
+       blocks_per_unit = max(prev_stripe_sectors, new_stripe_sectors);
+
+       new_array_sectors = mddev->dev_sectors;
+       new_array_sectors *= new_data_disks;
+       sector_div(new_array_sectors, blocks_per_unit);
+
+       conf->migr_rec.numMigrUnits = new_array_sectors;
+       conf->migr_rec.destDepthPerUnit = blocks_per_unit / new_data_disks;
+       conf->migr_rec.blocksPerUnit = blocks_per_unit;
+
+       return 0;
+}

 /****************************************************************
  *  for external meta we have to wait until sync_max in sysfs is 0
@@ -5968,6 +6136,7 @@ static int raid5_wait_reshape(mddev_t *mddev, unsigned int useTimeout)
                        mdk_rdev_t *rdev;
                        int added_devices = 0;
                        unsigned long flags;
+                       mddev_t *parent;

                        /* Add some new drives, as many as will fit. */
                        /* We know there are enough to make
@@ -6028,6 +6197,12 @@ static int raid5_wait_reshape(mddev_t *mddev, unsigned int useTimeout)
                                printk(KERN_WARNING "md: %s: Second check PASSED "
                                        "(raid5 reshape).\n",
                                        mdname(mddev));
+
+                               /* for IMSM meta handle Migration Record */
+                               parent = md_find_parent_container(mddev);
+                               if (parent && (!strcmp(parent->metadata_type, "imsm")))
+                                       if (raid5_init_reshape_imsm(mddev, conf) != 0)
+                                               retVal = 0;
                        }
                }
        }
@@ -6083,6 +6258,7 @@ static struct mdk_personality raid5_personality =
        .quiesce        = raid5_quiesce,
        .takeover       = raid5_takeover,
        .wait_reshape   = raid5_wait_reshape,
+       .write_migr_record = raid5_update_migr_record_imsm,
 };

 static struct mdk_personality raid4_personality =
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index dd70835..7f6387e 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -338,6 +338,24 @@ struct r6_state {
  * HANDLE gets cleared if stripe_handle leave nothing locked.
  */

+/* IMSM Migration Record: reshape checkpoint data. A copy is kept in
+ * raid5_private_data.migr_rec and written to member devices by
+ * raid5_update_migr_record_imsm().
+ * NOTE(review): fields are plain u32 in CPU byte order - confirm vs. OROM. */
+struct imsm_migr_record {
+       u32 recStatus; /* Status used to determine how to restart an aborted migration */
+       u32 currMigrUnit; /* 0..numMigrUnits-1 */
+       u32 familyNum; /* Family number of MPB containing the RaidDev that is migrating */
+       u32 ascendingMigr; /* True if migrating in increasing order of lbas */
+       u32 blocksPerUnit; /* Num disk blocks per unit of operation */
+       u32 destDepthPerUnit; /* Num member blocks each destMap member disk advances per unit-of-operation */
+       u32 ckptAreaPba;           /* Pba of first block of ckpt copy area */
+       u32 dest1stMemberLba;  /* First member lba on first stripe of destination */
+       u32 numMigrUnits;          /* Total num migration units-of-op */
+       u32 postMigrVolCapacity;  /* Size of volume after migration completes */
+       u32 postMigrVolCapacityHi;/*  Expansion space for LBA64 */
+       u32 ckptReadDiskNum;      /* Member disk in destSubMap[0] the ckpt record was read from (recovered migrations) */
+};

 struct disk_info {
        mdk_rdev_t      *rdev;
@@ -369,6 +387,8 @@ struct raid5_private_data {
        short                   generation; /* increments with every reshape */
        unsigned long           reshape_checkpoint; /* Time we last updated
                                                     * metadata */
+       /* reshape: imsm specific fields passed through the migration record */
+       struct imsm_migr_record migr_rec;

        struct list_head        handle_list; /* stripes needing handling */
        struct list_head        hold_list; /* preread ready stripes */

��.n��������+%������w��{.n�����{����w��ܨ}���Ơz�j:+v�����w����ޙ��&�)ߡ�a����z�ޗ���ݢj��w�f


[Index of Archives]     [Linux RAID Wiki]     [ATA RAID]     [Linux SCSI Target Infrastructure]     [Linux Block]     [Linux IDE]     [Linux SCSI]     [Linux Hams]     [Device Mapper]     [Device Mapper Cryptographics]     [Kernel]     [Linux Admin]     [Linux Net]     [GFS]     [RPM]     [git]     [Yosemite Forum]


  Powered by Linux