From: Darrick J. Wong <darrick.wong@xxxxxxxxxx> Manage the scheduling, issuance, and reporting of data block verification reads. This enables us to combine adjacent (or nearly adjacent) read requests, and to take advantage of high-IOPS devices by issuing IO from multiple threads. Signed-off-by: Darrick J. Wong <darrick.wong@xxxxxxxxxx> --- scrub/Makefile | 2 scrub/phase1.c | 1 scrub/phase2.c | 1 scrub/phase3.c | 1 scrub/phase5.c | 1 scrub/read_verify.c | 224 +++++++++++++++++++++++++++++++++++++++++++++++++++ scrub/read_verify.h | 58 +++++++++++++ scrub/scrub.c | 25 ++++++ scrub/scrub.h | 13 +++ 9 files changed, 326 insertions(+) create mode 100644 scrub/read_verify.c create mode 100644 scrub/read_verify.h diff --git a/scrub/Makefile b/scrub/Makefile index b1cd393..5df3e95 100644 --- a/scrub/Makefile +++ b/scrub/Makefile @@ -23,6 +23,7 @@ common.h \ counter.h \ disk.h \ ioctl.h \ +read_verify.h \ scrub.h \ xfs.h @@ -38,6 +39,7 @@ phase1.c \ phase2.c \ phase3.c \ phase5.c \ +read_verify.c \ scrub.c \ xfs.c diff --git a/scrub/phase1.c b/scrub/phase1.c index 6c3aab4..66f4aa3 100644 --- a/scrub/phase1.c +++ b/scrub/phase1.c @@ -25,6 +25,7 @@ #include "../repair/threads.h" #include "handle.h" #include "path.h" +#include "bitmap.h" #include "scrub.h" #include "common.h" #include "ioctl.h" diff --git a/scrub/phase2.c b/scrub/phase2.c index b8b44ac..88136a3 100644 --- a/scrub/phase2.c +++ b/scrub/phase2.c @@ -25,6 +25,7 @@ #include "../repair/threads.h" #include "handle.h" #include "path.h" +#include "bitmap.h" #include "scrub.h" #include "common.h" #include "ioctl.h" diff --git a/scrub/phase3.c b/scrub/phase3.c index cdd8a7c..b920995 100644 --- a/scrub/phase3.c +++ b/scrub/phase3.c @@ -25,6 +25,7 @@ #include "../repair/threads.h" #include "handle.h" #include "path.h" +#include "bitmap.h" #include "scrub.h" #include "common.h" #include "ioctl.h" diff --git a/scrub/phase5.c b/scrub/phase5.c index 7ea8b58..e5a5835 100644 --- a/scrub/phase5.c +++ b/scrub/phase5.c @@ -25,6 
+25,7 @@ #include "../repair/threads.h" #include "handle.h" #include "path.h" +#include "bitmap.h" #include "scrub.h" #include "common.h" #include "ioctl.h" diff --git a/scrub/read_verify.c b/scrub/read_verify.c new file mode 100644 index 0000000..18ba73a --- /dev/null +++ b/scrub/read_verify.c @@ -0,0 +1,224 @@ +/* + * Copyright (C) 2017 Oracle. All Rights Reserved. + * + * Author: Darrick J. Wong <darrick.wong@xxxxxxxxxx> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it would be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA. + */ +#include "libxfs.h" +#include <sys/statvfs.h> +#include <sys/types.h> +#include <dirent.h> +#include "disk.h" +#include "../repair/threads.h" +#include "path.h" +#include "disk.h" +#include "read_verify.h" +#include "scrub.h" +#include "common.h" +#include "counter.h" + +/* + * Read Verify Pool + * + * Manages the data block read verification phase. The caller schedules + * verification requests, which are then scheduled to be run by a thread + * pool worker. Adjacent (or nearly adjacent) requests can be combined + * to reduce overhead when free space fragmentation is high. The thread + * pool takes care of issuing multiple IOs to the device, if possible. + */ + +/* How many bytes have we verified? */ +static struct ptcounter *verified_bytes; + +/* Tolerate 64k holes in adjacent read verify requests. 
*/
+#define IO_BATCH_LOCALITY	(65536)
+
+/*
+ * Create a thread pool to run read verifiers.
+ *
+ * On success a newly allocated pool is stored in *rvpp and true is
+ * returned.  If either the pool or the verified-bytes counter cannot be
+ * allocated, false is returned and *rvpp is left untouched.  The pool only
+ * records the @readbuf pointer; nothing in this file allocates or frees
+ * that buffer.
+ */
+bool
+read_verify_pool_init(
+	struct read_verify_pool	**rvpp,
+	struct scrub_ctx	*ctx,
+	void			*readbuf,
+	size_t			readbufsz,
+	size_t			miniosz,
+	read_verify_ioerr_fn_t	ioerr_fn,
+	unsigned int		nproc)
+{
+	struct read_verify_pool	*rvp;
+
+	rvp = calloc(1, sizeof(*rvp));
+	if (!rvp)
+		return false;
+	verified_bytes = ptcounter_init(nproc);
+	if (!verified_bytes) {
+		free(rvp);
+		return false;
+	}
+	rvp->rvp_readbuf = readbuf;
+	rvp->rvp_readbufsz = readbufsz;
+	rvp->rvp_miniosz = miniosz;
+	rvp->rvp_ctx = ctx;
+	rvp->rvp_ioerr_fn = ioerr_fn;
+	rvp->rvp_nproc = nproc;
+	/*
+	 * The work queue's mount pointer slot is used to carry the pool;
+	 * read_verify() casts wq->mp back to a read_verify_pool.
+	 */
+	create_work_queue(&rvp->rvp_wq, (struct xfs_mount *)rvp, nproc);
+	*rvpp = rvp;
+	return true;
+}
+
+/*
+ * Finish up any read verification work and tear it down.
+ *
+ * Destroys the work queue, releases the verified-bytes counter, frees the
+ * pool itself and clears the caller's pointer.  Calling this when *rvpp is
+ * already NULL does nothing.
+ */
+void
+read_verify_pool_destroy(
+	struct read_verify_pool	**rvpp)
+{
+	struct read_verify_pool	*rvp = *rvpp;
+
+	if (!rvp)
+		return;
+
+	destroy_work_queue(&rvp->rvp_wq);
+	ptcounter_free(verified_bytes);
+	verified_bytes = NULL;
+	/* The pool was calloc'd by read_verify_pool_init(); don't leak it. */
+	free(rvp);
+	*rvpp = NULL;
+}
+
+/*
+ * Issue a read-verify IO in big batches.
+ *
+ * Work queue callback.  @arg is the heap copy of a struct read_verify made
+ * by read_verify_queue(); it is consumed and freed here.  @agno is not
+ * used.  The extent is verified in chunks of at most rvp_readbufsz bytes.
+ * When a chunk fails, up to rvp_miniosz bytes at the failing offset are
+ * reported through the pool's ioerr callback and verification resumes
+ * right after them.  The number of bytes stepped over (including the
+ * reported ones) is added to the verified-bytes counter.
+ *
+ * NOTE(review): every worker passes the same rvp_readbuf to
+ * disk_read_verify(); presumably harmless because the buffer contents are
+ * never inspected here -- confirm against disk_read_verify().
+ */
+static void
+read_verify(
+	struct work_queue	*wq,
+	xfs_agnumber_t		agno,
+	void			*arg)
+{
+	struct read_verify	*rv = arg;
+	struct read_verify_pool	*rvp;
+	unsigned long long	verified = 0;
+	ssize_t			sz;
+	ssize_t			len;
+	int			error;
+
+	rvp = (struct read_verify_pool *)wq->mp;
+	while (rv->io_length > 0) {
+		len = min(rv->io_length, rvp->rvp_readbufsz);
+		dbg_printf("diskverify %d %"PRIu64" %zd\n", rv->io_disk->d_fd,
+				rv->io_start, len);
+		sz = disk_read_verify(rv->io_disk, rvp->rvp_readbuf,
+				rv->io_start, len);
+		if (sz < 0) {
+			/* Save errno before another call can clobber it. */
+			error = errno;
+			dbg_printf("IOERR %d %"PRIu64" %zd\n",
+					rv->io_disk->d_fd,
+					rv->io_start, len);
+			/*
+			 * IO error, so try the next logical block.  Never
+			 * step past the end of the extent: io_length is
+			 * unsigned and would wrap around below, sending the
+			 * loop far beyond the requested range.
+			 */
+			len = min(rv->io_length, rvp->rvp_miniosz);
+			rvp->rvp_ioerr_fn(rvp, rv->io_disk, rv->io_start, len,
+					error, rv->io_end_arg);
+		}
+
+		verified += len;
+		rv->io_start += len;
+		rv->io_length -= len;
+	}
+
+	free(rv);
+	ptcounter_add(verified_bytes, verified);
+}
+
+/*
+ * Queue a read verify request.
+ *
+ * A heap copy of @rv is handed to the work queue so that the caller can
+ * keep reusing its own structure; read_verify() frees the copy.  If the
+ * copy cannot be allocated, the whole extent is reported through the ioerr
+ * callback with the errno left behind by malloc().
+ */
+static void
+read_verify_queue(
+	struct read_verify_pool	*rvp,
+	struct read_verify	*rv)
+{
+	struct read_verify	*tmp;
+
+	dbg_printf("verify fd %d start %"PRIu64" len %"PRIu64"\n",
+			rv->io_disk->d_fd, rv->io_start, rv->io_length);
+
+	tmp = malloc(sizeof(*tmp));
+	if (!tmp) {
+		rvp->rvp_ioerr_fn(rvp, rv->io_disk, rv->io_start, rv->io_length,
+				errno, rv->io_end_arg);
+		return;
+	}
+	*tmp = *rv;
+
+	queue_work(&rvp->rvp_wq, read_verify, 0, tmp);
+}
+
+/*
+ * Issue an IO request.  We'll batch subsequent requests if they're
+ * within 64k of each other.
+ *
+ * @rv is the caller's stash: it holds at most one pending extent, and
+ * io_length == 0 means it is empty.  A new request is merged into the
+ * stash when the disk and @end_arg match and either extent starts inside,
+ * or no more than IO_BATCH_LOCALITY bytes past the end of, the other one.
+ * Otherwise the stashed extent (if any) is queued for verification and the
+ * new request takes its place.  Nothing is verified until the stash is
+ * queued, so callers must finish with read_verify_force().
+ */
+void
+read_verify_schedule(
+	struct read_verify_pool	*rvp,
+	struct read_verify	*rv,
+	struct disk		*disk,
+	uint64_t		start,
+	uint64_t		length,
+	void			*end_arg)
+{
+	uint64_t		req_end;
+	uint64_t		rv_end;
+
+	assert(rvp->rvp_readbuf);
+	req_end = start + length;
+	rv_end = rv->io_start + rv->io_length;
+
+	/*
+	 * If we have a stashed IO, we haven't changed fds, the error
+	 * reporting is the same, and the two extents are close,
+	 * we can combine them.
+	 */
+	if (rv->io_length > 0 && disk == rv->io_disk &&
+	    end_arg == rv->io_end_arg &&
+	    ((start >= rv->io_start && start <= rv_end + IO_BATCH_LOCALITY) ||
+	     (rv->io_start >= start &&
+	      rv->io_start <= req_end + IO_BATCH_LOCALITY))) {
+		/* The merged extent spans both, including any gap. */
+		rv->io_start = min(rv->io_start, start);
+		rv->io_length = max(req_end, rv_end) - rv->io_start;
+	} else {
+		/* Otherwise, issue the stashed IO (if there is one) */
+		if (rv->io_length > 0)
+			read_verify_queue(rvp, rv);
+
+		/* Stash the new IO. */
+		rv->io_disk = disk;
+		rv->io_start = start;
+		rv->io_length = length;
+		rv->io_end_arg = end_arg;
+	}
+}
+
+/*
+ * Force any stashed IOs into the verifier.
+ *
+ * Queues the extent held in @rv (if any) and marks the stash empty.
+ */
+void
+read_verify_force(
+	struct read_verify_pool	*rvp,
+	struct read_verify	*rv)
+{
+	assert(rvp->rvp_readbuf);
+	if (rv->io_length == 0)
+		return;
+
+	read_verify_queue(rvp, rv);
+	rv->io_length = 0;
+}
+
+/*
+ * How many bytes has this process verified?
+ *
+ * Returns 0 whenever no counter is allocated, i.e. before
+ * read_verify_pool_init() and after read_verify_pool_destroy().
+ */
+unsigned long long
+read_verify_bytes(void)
+{
+	if (!verified_bytes)
+		return 0;
+	return ptcounter_value(verified_bytes);
+}
+
diff --git a/scrub/read_verify.h b/scrub/read_verify.h
new file mode 100644
index 0000000..59cddd7
--- /dev/null
+++ b/scrub/read_verify.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (C) 2017 Oracle. All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@xxxxxxxxxx>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301, USA.
+ */
+#ifndef XFS_SCRUB_READ_VERIFY_H_
+#define XFS_SCRUB_READ_VERIFY_H_
+
+struct read_verify_pool;
+
+/* Function called when an IO error happens.
*/ +typedef void (*read_verify_ioerr_fn_t)(struct read_verify_pool *rvp, + struct disk *disk, uint64_t start, uint64_t length, + int error, void *arg); + +struct read_verify_pool { /* state shared by every worker in one pool */ + struct work_queue rvp_wq; /* thread pool */ + struct scrub_ctx *rvp_ctx; /* scrub context */ + void *rvp_readbuf; /* read buffer */ + read_verify_ioerr_fn_t rvp_ioerr_fn; /* io error callback */ + size_t rvp_miniosz; /* minimum io size, bytes */ + size_t rvp_readbufsz; /* read buffer size, bytes */ + int rvp_nproc; /* number of threads */ +}; + +bool read_verify_pool_init(struct read_verify_pool **rvpp, struct scrub_ctx *ctx, + void *readbuf, size_t readbufsz, size_t miniosz, + read_verify_ioerr_fn_t ioerr_fn, unsigned int nproc); +void read_verify_pool_destroy(struct read_verify_pool **rvpp); + +struct read_verify { /* one stashed or queued verify extent */ + void *io_end_arg; /* passed back to the io error callback */ + struct disk *io_disk; /* disk to verify */ + uint64_t io_start; /* start offset, bytes */ + uint64_t io_length; /* bytes; 0 means nothing is stashed */ +}; + +void read_verify_schedule(struct read_verify_pool *rvp, struct read_verify *rv, + struct disk *disk, uint64_t start, uint64_t length, + void *end_arg); +void read_verify_force(struct read_verify_pool *rvp, struct read_verify *rv); +unsigned long long read_verify_bytes(void); + +#endif /* XFS_SCRUB_READ_VERIFY_H_ */ diff --git a/scrub/scrub.c b/scrub/scrub.c index c2385da..d4527e4 100644 --- a/scrub/scrub.c +++ b/scrub/scrub.c @@ -32,6 +32,7 @@ #include "../repair/threads.h" #include "path.h" #include "disk.h" +#include "read_verify.h" #include "scrub.h" #include "common.h" #include "input.h" @@ -251,6 +252,8 @@ phase_start( return false; } + pi->verified_bytes = read_verify_bytes(); + pi->descr = descr; if ((verbose || display_rusage) && descr) { fprintf(stdout, _("Phase %u: %s\n"), phase, descr); @@ -272,11 +275,14 @@ phase_end( struct timeval time_now; char phasebuf[DESCR_BUFSZ]; double dt; + unsigned long long verified; long in, out; long io; double i, o, t; double din, dout, dtot; char *iu, *ou, *tu, *dinu, *doutu, *dtotu; + double v, dv; + char *vu, 
*dvu; int error; if (!display_rusage) @@ -339,6 +345,15 @@ _("%sI/O: %.1f%s in, %.1f%s out, %.1f%s tot\n"), _("%sI/O rate: %.1f%s/s in, %.1f%s/s out, %.1f%s/s tot\n"), phasebuf, din, dinu, dout, doutu, dtot, dtotu); } + + /* How many bytes were read-verified? */ + verified = read_verify_bytes() - pi->verified_bytes; + if (verified) { + v = auto_space_units(verified, &vu); + dv = auto_space_units(verified / dt, &dvu); + fprintf(stdout, _("Phase %u: Verify: %.1f%s, rate: %.1f%s/s\n"), + phase, v, vu, dv, dvu); + } fflush(stdout); return true; @@ -496,6 +511,7 @@ main( bool ismnt; static bool injected; int ret; + int error; fprintf(stderr, "XXX: This program is not complete!\n"); return 4; @@ -639,6 +655,14 @@ _("Only one of the options -n or -y may be specified.\n")); goto out; } + /* Try to allocate a read buffer if we don't have one. */ + error = posix_memalign((void **)&ctx.readbuf, page_size, + IO_MAX_SIZE); + if (error || !ctx.readbuf) { + str_errno(&ctx, ctx.mntpoint); + goto out; + } + if (debug_tweak_on("XFS_SCRUB_FORCE_REPAIR") && !injected) { ctx.mode = SCRUB_MODE_REPAIR; injected = true; @@ -692,6 +716,7 @@ _("%s: %llu warnings found.\n"), disk_close(&ctx.datadev); free(ctx.blkdev); + free(ctx.readbuf); free(ctx.mntpoint); end: return ret; diff --git a/scrub/scrub.h b/scrub/scrub.h index 87f59d6..0b82d9f 100644 --- a/scrub/scrub.h +++ b/scrub/scrub.h @@ -42,6 +42,15 @@ enum error_action { ERRORS_SHUTDOWN, }; +/* + * Perform all IO in 32M chunks. This cannot exceed 65536 sectors + * because that's the biggest SCSI VERIFY(16) we dare to send. + */ +#define IO_MAX_SIZE 33554432 +#define IO_MAX_SECTORS (IO_MAX_SIZE >> BBSHIFT) + +struct read_verify_pool; + struct scrub_ctx { /* Immutable scrub state. */ @@ -81,8 +90,12 @@ struct scrub_ctx { void *fshandle; size_t fshandle_len; + /* Data block read verification buffer */ + void *readbuf; + /* Mutable scrub state; use lock. 
*/ pthread_mutex_t lock; + struct read_verify_pool *rvp; unsigned long long max_errors; unsigned long long runtime_errors; unsigned long long errors_found; -- To unsubscribe from this list: send the line "unsubscribe linux-xfs" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html