This is the first step in allowing md to track bad blocks per-device so that
we can fail individual blocks rather than the whole device.

This patch just adds a data structure for recording bad blocks, with routines
to add, remove, and search the list.

Signed-off-by: NeilBrown <neilb@xxxxxxx>
---
 drivers/md/dm-raid456.c |    6 +
 drivers/md/md.c         |  427 +++++++++++++++++++++++++++++++++++++++++++++++
 drivers/md/md.h         |   46 +++++
 3 files changed, 475 insertions(+), 4 deletions(-)

diff --git a/drivers/md/dm-raid456.c b/drivers/md/dm-raid456.c
index 3dcbc4a..5030d16 100644
--- a/drivers/md/dm-raid456.c
+++ b/drivers/md/dm-raid456.c
@@ -112,7 +112,11 @@ static int dev_parms(struct raid_set *rs, char **argv)
 		int err = 0;
 		unsigned long long offset;
 
-		md_rdev_init(&rs->dev[i].rdev);
+		err = md_rdev_init(&rs->dev[i].rdev);
+		if (err) {
+			rs->ti->error = "Memory allocation failure";
+			return err;
+		}
 		rs->dev[i].rdev.raid_disk = i;
 
 		if (strcmp(argv[0], "-") == 0)
diff --git a/drivers/md/md.c b/drivers/md/md.c
index e0a9bf8..8ae8322 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -1900,6 +1900,10 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev)
 	sysfs_remove_link(&rdev->kobj, "block");
 	sysfs_put(rdev->sysfs_state);
 	rdev->sysfs_state = NULL;
+	kfree(rdev->badblocks.page);
+	rdev->badblocks.count = 0;
+	rdev->badblocks.page = NULL;
+	rdev->badblocks.active_page = NULL;
 	/* We need to delay this, otherwise we can deadlock when
 	 * writing to 'remove' to "dev/state". We also need
 	 * to delay it due to rcu usage.
@@ -2738,7 +2742,7 @@ static struct kobj_type rdev_ktype = {
 	.default_attrs = rdev_default_attrs,
 };
 
-void md_rdev_init(mdk_rdev_t *rdev)
+int md_rdev_init(mdk_rdev_t *rdev)
 {
 	rdev->desc_nr = -1;
 	rdev->saved_raid_disk = -1;
@@ -2754,6 +2758,20 @@ void md_rdev_init(mdk_rdev_t *rdev)
 	INIT_LIST_HEAD(&rdev->same_set);
 	init_waitqueue_head(&rdev->blocked_wait);
+
+	/* Add space to store bad block list.
+	 * This reserves the space even on arrays where it cannot
+	 * be used - I wonder if that matters
+	 */
+	rdev->badblocks.count = 0;
+	rdev->badblocks.shift = 0;
+	rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL);
+	rdev->badblocks.active_page = rdev->badblocks.page;
+	spin_lock_init(&rdev->badblocks.lock);
+	if (rdev->badblocks.page == NULL)
+		return -ENOMEM;
+
+	return 0;
 }
 EXPORT_SYMBOL_GPL(md_rdev_init);
 /*
@@ -2779,7 +2797,8 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
 		return ERR_PTR(-ENOMEM);
 	}
 
-	md_rdev_init(rdev);
+	if ((err = md_rdev_init(rdev)))
+		goto abort_free;
 
 	if ((err = alloc_disk_sb(rdev)))
 		goto abort_free;
@@ -7212,6 +7231,410 @@ void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
 }
 EXPORT_SYMBOL(md_wait_for_blocked_rdev);
 
+
+/* Bad block management.
+ * We can record which blocks on each device are 'bad' and so just
+ * fail those blocks, or that stripe, rather than the whole device.
+ * Entries in the bad-block table are 64 bits wide.  This comprises:
+ * Length of bad-range, in sectors: 0-511 for lengths 1-512
+ * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes)
+ * A 'shift' can be set so that larger blocks are tracked and
+ * consequently larger devices can be covered.
+ * 'Acknowledged' flag - 1 bit - the most significant bit.
+ */
+/* Locking of the bad-block table is a two-layer affair.
+ * Read access through ->active_page only requires rcu_read_lock().
+ * However, if ->active_page is found to be NULL, the table
+ * should be accessed through ->page, which requires a spinlock.
+ * Updating the page requires setting ->active_page to NULL,
+ * synchronising with rcu, then updating ->page under the same
+ * spinlock.
+ *
+ */
+/* When looking for a bad block we specify a range and want to
+ * know if any block in the range is bad.  So we binary-search
+ * to the last range that starts at-or-before the given endpoint
+ * (or "before the sector after the target range"),
+ * then see if it ends after the given start.
+ * We return
+ *  0 if there are no known bad blocks in the range
+ *  1 if there are known bad blocks which are all acknowledged
+ * -1 if there are bad blocks which have not yet been acknowledged
+ *    in metadata,
+ * plus the start/length of the first bad section we overlap.
+ */
+int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
+		   sector_t *first_bad, int *bad_sectors)
+{
+	int hi;
+	int lo = 0;
+	u64 *p;
+	int rv = 0;
+	int havelock = 0;
+	sector_t target = s + sectors;
+
+	if (bb->shift) {
+		/* round down the start, and up the end */
+		s >>= bb->shift;
+		target |= (1<<bb->shift) - 1;
+		target++;
+		target >>= bb->shift;
+		sectors = target - s;
+	}
+	/* 'target' is now the first block after the bad range */
+
+	rcu_read_lock();
+	p = rcu_dereference(bb->active_page);
+	if (!p) {
+		spin_lock(&bb->lock);
+		p = bb->page;
+		havelock = 1;
+	}
+	hi = bb->count;
+
+	/* Binary search between lo and hi for 'target'
+	 * i.e. for the last range that starts before 'target'
+	 */
+	/* INVARIANT: ranges before 'lo' and at-or-after 'hi'
+	 * are known not to be the last range before target.
+	 * VARIANT: hi-lo is the number of possible
+	 * ranges, and decreases until it reaches 1
+	 */
+	while (hi - lo > 1) {
+		int mid = (lo + hi) / 2;
+		sector_t a = BB_OFFSET(p[mid]);
+		if (a < target)
+			/* This could still be the one, earlier ranges
+			 * could not. */
+			lo = mid;
+		else
+			/* This and later ranges are definitely out. */
+			hi = mid;
+	}
+	/* 'lo' might be the last that started before target, but 'hi' isn't */
+	if (hi > lo) {
+		/* need to check all ranges that end after 's' to see if
+		 * any are unacknowledged.
+		 */
+		while (lo >= 0 &&
+		       BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
+			/* starts before the end, and finishes after
+			 * the start, so they must overlap
+			 */
+			if (rv != -1 && BB_ACK(p[lo]))
+				rv = 1;
+			else
+				rv = -1;
+			*first_bad = BB_OFFSET(p[lo]);
+			*bad_sectors = BB_LEN(p[lo]);
+			lo--;
+		}
+	}
+
+	if (havelock)
+		spin_unlock(&bb->lock);
+	rcu_read_unlock();
+	return rv;
+}
+EXPORT_SYMBOL_GPL(md_is_badblock);
+
+/*
+ * Add a range of bad blocks to the table.
+ * This might extend the table, or might contract it
+ * if two adjacent ranges can be merged.
+ * We binary-search to find the 'insertion' point, then
+ * decide how best to handle it.
+ */
+int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
+		     int acknowledged)
+{
+	u64 *p;
+	int lo, hi;
+	int rv = 1;
+
+	if (bb->shift) {
+		/* round down the start, and up the end */
+		sector_t next = s + sectors;
+		s >>= bb->shift;
+		next |= (1<<bb->shift) - 1;
+		next++;
+		next >>= bb->shift;
+		sectors = next - s;
+	}
+
+again:
+	rcu_assign_pointer(bb->active_page, NULL);
+	synchronize_rcu();
+	spin_lock(&bb->lock);
+	if (bb->active_page) {
+		/* someone else just unlocked, better retry */
+		spin_unlock(&bb->lock);
+		goto again;
+	}
+	/* now have exclusive access to the page */
+
+add_more:
+	p = bb->page;
+	lo = 0;
+	hi = bb->count;
+	/* Find the last range that starts at-or-before 's' */
+	while (hi - lo > 1) {
+		int mid = (lo + hi) / 2;
+		sector_t a = BB_OFFSET(p[mid]);
+		if (a <= s)
+			lo = mid;
+		else
+			hi = mid;
+	}
+	if (hi > lo && BB_OFFSET(p[lo]) > s)
+		hi = lo;
+
+	if (hi > lo) {
+		/* we found a range that might merge with the start
+		 * of our new range
+		 */
+		sector_t a = BB_OFFSET(p[lo]);
+		sector_t e = a + BB_LEN(p[lo]);
+		int ack = BB_ACK(p[lo]);
+		if (e >= s) {
+			/* Yes, we can merge with a previous range */
+			if (s <= a && s + sectors >= e) {
+				/* new range covers old */
+				if (!ack)
+					ack = acknowledged;
+			} else {
+				if (!acknowledged)
+					ack = acknowledged;
+			}
+			if (e < s + sectors)
+				e = s + sectors;
+			if (s + sectors <= a + BB_MAX_LEN) {
+				p[lo] = BB_MAKE(a, e-a, ack);
+				s = e;
+			} else {
+				/* does not all fit in one range,
+				 * make p[lo] maximal
+				 */
+				if (BB_LEN(p[lo]) != BB_MAX_LEN)
+					p[lo] = BB_MAKE(a, BB_MAX_LEN, ack);
+				s = a + BB_MAX_LEN;
+			}
+			sectors = e - s;
+		}
+	}
+	if (sectors && hi < bb->count) {
+		/* 'hi' points to the first range that starts after 's'.
+		 * Maybe we can merge with the start of that range */
+		sector_t a = BB_OFFSET(p[hi]);
+		sector_t e = a + BB_LEN(p[hi]);
+		int ack = BB_ACK(p[hi]);
+		if (a <= (s + sectors)) {
+			/* merging is possible */
+			if (e < s + sectors)
+				/* full overlap */
+				e = s + sectors;
+			if (a > s)
+				a = s;
+			if (e - a <= BB_MAX_LEN) {
+				p[hi] = BB_MAKE(a, e-a, acknowledged && ack);
+				sectors = 0;
+				s = e;
+			} else {
+				p[hi] = BB_MAKE(a, BB_MAX_LEN,
+						acknowledged && ack);
+				s = a + BB_MAX_LEN;
+				sectors -= BB_MAX_LEN;
+			}
+			hi++;
+		}
+	}
+	if (sectors == 0 && hi < bb->count) {
+		/* we might be able to combine lo and hi */
+		sector_t a = BB_OFFSET(p[hi]);
+		int lolen = BB_LEN(p[lo]);
+		int hilen = BB_LEN(p[hi]);
+		int newlen = lolen + hilen - (s - a);
+		if (s >= a && newlen < BB_MAX_LEN) {
+			/* yes, we can combine them */
+			int ack = BB_ACK(p[lo]) || BB_ACK(p[hi]);
+			p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack);
+			memmove(p + hi, p + hi + 1,
+				(bb->count - hi) * 8);
+			bb->count--;
+		}
+	}
+	if (sectors) {
+		/* didn't merge (it all).
+		 * Need to add a range just before 'hi' */
+		if (bb->count >= MD_MAX_BADBLOCKS)
+			/* No room for more */
+			rv = 0;
+		else {
+			memmove(p + hi + 1, p + hi,
+				(bb->count - hi) * 8);
+			bb->count++;
+			if (sectors <= BB_MAX_LEN)
+				p[hi] = BB_MAKE(s, sectors, acknowledged);
+			else {
+				p[hi] = BB_MAKE(s, BB_MAX_LEN, acknowledged);
+				s += BB_MAX_LEN;
+				sectors -= BB_MAX_LEN;
+				goto add_more;
+			}
+		}
+	}
+
+	bb->changed = 1;
+	rcu_assign_pointer(bb->active_page, bb->page);
+	spin_unlock(&bb->lock);
+
+	return rv;
+}
+EXPORT_SYMBOL_GPL(md_set_badblocks);
+
+/*
+ * Remove a range of bad blocks from the table.
+ * This may involve extending the table if we split a region,
+ * but it must not fail.  So if the table becomes full, we just
+ * drop the remove request.
+ */
+int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors)
+{
+	u64 *p;
+	int lo, hi;
+	sector_t target = s + sectors;
+	int rv = 0;
+
+	if (bb->shift) {
+		/* FIXME should this round the other way??? */
+		/* round down the start, and up the end?
+		 * It should never matter as block shift should
+		 * be aligned with basic IO size, and this
+		 * way seems safer
+		 */
+		s >>= bb->shift;
+		target |= (1<<bb->shift) - 1;
+		target++;
+		target >>= bb->shift;
+		sectors = target - s;
+	}
+
+again:
+	rcu_assign_pointer(bb->active_page, NULL);
+	synchronize_rcu();
+	spin_lock(&bb->lock);
+	if (bb->active_page) {
+		/* someone else just unlocked, better retry */
+		spin_unlock(&bb->lock);
+		goto again;
+	}
+	/* now have exclusive access to the page */
+
+	p = bb->page;
+	lo = 0;
+	hi = bb->count;
+	/* Find the last range that starts before 'target' */
+	while (hi - lo > 1) {
+		int mid = (lo + hi) / 2;
+		sector_t a = BB_OFFSET(p[mid]);
+		if (a < target)
+			lo = mid;
+		else
+			hi = mid;
+	}
+	if (hi > lo) {
+		/* p[lo] is the last range that could overlap the
+		 * current range.  Earlier ranges could also overlap,
+		 * but only this one can overlap the end of the range.
+		 */
+		if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) {
+			/* Partial overlap, leave the tail of this range */
+			int ack = BB_ACK(p[lo]);
+			sector_t a = BB_OFFSET(p[lo]);
+			sector_t end = a + BB_LEN(p[lo]);
+
+			if (a < s) {
+				/* we need to split this range */
+				if (bb->count >= MD_MAX_BADBLOCKS) {
+					rv = 0;
+					goto out;
+				}
+				memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
+				bb->count++;
+				p[lo] = BB_MAKE(a, s-a, ack);
+				lo++;
+			}
+			p[lo] = BB_MAKE(target, end - target, ack);
+			/* there is no longer an overlap */
+			hi = lo;
+			lo--;
+		}
+		while (lo >= 0 &&
+		       BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
+			/* This range does overlap */
+			if (BB_OFFSET(p[lo]) < s) {
+				/* Keep the early parts of this range. */
+				int ack = BB_ACK(p[lo]);
+				sector_t start = BB_OFFSET(p[lo]);
+				p[lo] = BB_MAKE(start, s - start, ack);
+				/* now 'lo' doesn't overlap, so.. */
+				break;
+			}
+			lo--;
+		}
+		/* 'lo' is strictly before, 'hi' is strictly after,
+		 * anything between needs to be discarded
+		 */
+		if (hi - lo > 1) {
+			memmove(p+lo+1, p+hi, (bb->count - hi) * 8);
+			bb->count -= (hi - lo - 1);
+		}
+	}
+
+	bb->changed = 1;
+out:
+	rcu_assign_pointer(bb->active_page, bb->page);
+	spin_unlock(&bb->lock);
+	return rv;
+}
+EXPORT_SYMBOL_GPL(md_clear_badblocks);
+
+/*
+ * Acknowledge all bad blocks in a list.
+ * This only succeeds if ->changed is clear.
+ * It is used by in-kernel metadata updates.
+ */
+void md_ack_all_badblocks(struct badblocks *bb)
+{
+	if (bb->page == NULL || bb->changed)
+		/* no point even trying */
+		return;
+again:
+	rcu_assign_pointer(bb->active_page, NULL);
+	synchronize_rcu();
+	spin_lock(&bb->lock);
+	if (bb->active_page) {
+		/* someone else just unlocked, better retry */
+		spin_unlock(&bb->lock);
+		goto again;
+	}
+	/* now have exclusive access to the page */
+
+	if (bb->changed == 0) {
+		u64 *p = bb->page;
+		int i;
+		for (i = 0; i < bb->count ; i++) {
+			if (!BB_ACK(p[i])) {
+				sector_t start = BB_OFFSET(p[i]);
+				int len = BB_LEN(p[i]);
+				p[i] = BB_MAKE(start, len, 1);
+			}
+		}
+	}
+	rcu_assign_pointer(bb->active_page, bb->page);
+	spin_unlock(&bb->lock);
+}
+EXPORT_SYMBOL_GPL(md_ack_all_badblocks);
+
 static int md_notify_reboot(struct notifier_block *this,
 			    unsigned long code, void *x)
 {
diff --git a/drivers/md/md.h b/drivers/md/md.h
index e53b355..a24e131 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -49,6 +49,12 @@ static inline void plugger_flush(struct plug_handle *plug)
 	cancel_work_sync(&plug->unplug_work);
 }
 
+/* Bad block numbers are stored sorted in a single page.
+ * 64 bits are used for each block or extent.
+ * 54 bits are sector number, 9 bits are extent size, 1 bit is 'acknowledged'.
+ */
+#define MD_MAX_BADBLOCKS	(PAGE_SIZE/8)
+
 /*
  * MD's 'extended' device
  */
@@ -125,8 +131,46 @@ struct mdk_rdev_s
 	struct sysfs_dirent *sysfs_state;	/* handle for 'state'
 						 * sysfs entry */
+
+	struct badblocks {
+		int	count;		/* count of bad blocks */
+		int	shift;		/* shift from sectors to block size */
+		u64	*page;		/* badblock list */
+		u64	*active_page;	/* either 'page' or 'NULL' */
+		int	changed;
+		spinlock_t lock;
+	} badblocks;
 };
 
+#define BB_LEN_MASK	(0x00000000000001FFULL)
+#define BB_OFFSET_MASK	(0x7FFFFFFFFFFFFE00ULL)
+#define BB_ACK_MASK	(0x8000000000000000ULL)
+#define BB_MAX_LEN	512
+#define BB_OFFSET(x)	(((x) & BB_OFFSET_MASK) >> 9)
+#define BB_LEN(x)	(((x) & BB_LEN_MASK) + 1)
+#define BB_ACK(x)	(!!((x) & BB_ACK_MASK))
+#define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63))
+
+extern int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
+			  sector_t *first_bad, int *bad_sectors);
+static inline int is_badblock(mdk_rdev_t *rdev, sector_t s, int sectors,
+			      sector_t *first_bad, int *bad_sectors)
+{
+	if (unlikely(rdev->badblocks.count)) {
+		int rv = md_is_badblock(&rdev->badblocks, rdev->data_offset + s,
+					sectors,
+					first_bad, bad_sectors);
+		if (rv)
+			*first_bad -= rdev->data_offset;
+		return rv;
+	}
+	return 0;
+}
+extern int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
+			    int acknowledged);
+extern int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors);
+extern void md_ack_all_badblocks(struct badblocks *bb);
+
 struct mddev_s
 {
 	void				*private;
@@ -517,7 +561,7 @@ extern void mddev_init(mddev_t *mddev);
 extern int md_run(mddev_t *mddev);
 extern void md_stop(mddev_t *mddev);
 extern void md_stop_writes(mddev_t *mddev);
-extern void md_rdev_init(mdk_rdev_t *rdev);
+extern int md_rdev_init(mdk_rdev_t *rdev);
 extern void mddev_suspend(mddev_t *mddev);
 extern void mddev_resume(mddev_t *mddev);
--
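
The entry format described at the top of the new md.c comment block (length-1
in the low 9 bits, start sector in the next 54, 'acknowledged' flag in the top
bit) can be sanity-checked outside the kernel.  Below is a minimal stand-alone
user-space sketch - illustration only, not part of the patch - that uses the
BB_* macros exactly as defined in the md.h hunk above; the u64 typedef, the
includes and the example sector/length values are the only additions:

#include <stdio.h>
#include <stdint.h>

typedef uint64_t u64;

/* same encoding macros as in the md.h hunk above */
#define BB_LEN_MASK	(0x00000000000001FFULL)
#define BB_OFFSET_MASK	(0x7FFFFFFFFFFFFE00ULL)
#define BB_ACK_MASK	(0x8000000000000000ULL)
#define BB_MAX_LEN	512
#define BB_OFFSET(x)	(((x) & BB_OFFSET_MASK) >> 9)
#define BB_LEN(x)	(((x) & BB_LEN_MASK) + 1)
#define BB_ACK(x)	(!!((x) & BB_ACK_MASK))
#define BB_MAKE(a, l, ack) (((a)<<9) | ((l)-1) | ((u64)(!!(ack)) << 63))

int main(void)
{
	/* a 16-sector bad range starting at sector 123456, not yet acknowledged */
	u64 entry = BB_MAKE(123456ULL, 16, 0);

	/* unpacking recovers the original values: offset=123456 len=16 ack=0 */
	printf("offset=%llu len=%llu ack=%d\n",
	       (unsigned long long)BB_OFFSET(entry),
	       (unsigned long long)BB_LEN(entry),
	       BB_ACK(entry));
	return 0;
}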