On Fri, Feb 12, 2021 at 09:46:56PM -0800, Darrick J. Wong wrote: > From: Darrick J. Wong <djwong@xxxxxxxxxx> > > Add an error injection knob so that we can simulate system failure after > a certain number of disk writes. This knob is being added so that we > can check repair's behavior after an arbitrary number of tests. > > Set LIBXFS_DEBUG_WRITE_CRASH={ddev,logdev,rtdev}=nn in the environment > to make libxfs SIGKILL itself after nn writes to the data, log, or rt > devices. Note that this only applies to xfs_buf writes and zero_range. > > Signed-off-by: Darrick J. Wong <djwong@xxxxxxxxxx> > --- > libxfs/init.c | 68 +++++++++++++++++++++++++++++++++++++++++++++++++--- > libxfs/libxfs_io.h | 19 +++++++++++++++ > libxfs/rdwr.c | 6 ++++- > 3 files changed, 88 insertions(+), 5 deletions(-) > > > diff --git a/libxfs/init.c b/libxfs/init.c > index 8a8ce3c4..1ec83791 100644 > --- a/libxfs/init.c > +++ b/libxfs/init.c ... > @@ -614,6 +634,46 @@ libxfs_buftarg_init( > dev_t logdev, > dev_t rtdev) > { > + char *p = getenv("LIBXFS_DEBUG_WRITE_CRASH"); > + unsigned long dfail = 0, lfail = 0, rfail = 0; Was there a reason for using an environment variable now rather than the original command line option? > + > + /* Simulate utility crash after a certain number of writes. 
*/ > + while (p && *p) { > + char *val; > + > + switch (getsubopt(&p, wf_opts, &val)) { > + case WF_DATA: > + if (!val) { > + fprintf(stderr, > + _("ddev write fail requires a parameter\n")); > + exit(1); > + } > + dfail = strtoul(val, NULL, 0); > + break; > + case WF_LOG: > + if (!val) { > + fprintf(stderr, > + _("logdev write fail requires a parameter\n")); > + exit(1); > + } > + lfail = strtoul(val, NULL, 0); > + break; > + case WF_RT: > + if (!val) { > + fprintf(stderr, > + _("rtdev write fail requires a parameter\n")); > + exit(1); > + } > + rfail = strtoul(val, NULL, 0); > + break; > + default: > + fprintf(stderr, _("unknown write fail type %s\n"), > + val); > + exit(1); > + break; > + } > + } > + > if (mp->m_ddev_targp) { > /* should already have all buftargs initialised */ > if (mp->m_ddev_targp->bt_bdev != dev || ... > diff --git a/libxfs/libxfs_io.h b/libxfs/libxfs_io.h > index c80e2d59..85485257 100644 > --- a/libxfs/libxfs_io.h > +++ b/libxfs/libxfs_io.h ... > @@ -30,6 +32,23 @@ struct xfs_buftarg { > #define XFS_BUFTARG_LOST_WRITE (1 << 0) > /* A dirty buffer failed the write verifier. */ > #define XFS_BUFTARG_CORRUPT_WRITE (1 << 1) > +/* Simulate failure after a certain number of writes. */ > +#define XFS_BUFTARG_INJECT_WRITE_FAIL (1 << 2) > + > +/* Simulate the system crashing after a write. */ > +static inline void > +xfs_buftarg_trip_write( > + struct xfs_buftarg *btp) > +{ > + if (!(btp->flags & XFS_BUFTARG_INJECT_WRITE_FAIL)) > + return; > + > + pthread_mutex_lock(&btp->lock); > + btp->writes_left--; > + if (!btp->writes_left) > + kill(getpid(), SIGKILL); Can we just exit()? (Same questions for the next patch.)
Brian > + pthread_mutex_unlock(&btp->lock); > +} > > extern void libxfs_buftarg_init(struct xfs_mount *mp, dev_t ddev, > dev_t logdev, dev_t rtdev); > diff --git a/libxfs/rdwr.c b/libxfs/rdwr.c > index ca272387..fd456d6b 100644 > --- a/libxfs/rdwr.c > +++ b/libxfs/rdwr.c > @@ -74,8 +74,10 @@ libxfs_device_zero(struct xfs_buftarg *btp, xfs_daddr_t start, uint len) > /* try to use special zeroing methods, fall back to writes if needed */ > len_bytes = LIBXFS_BBTOOFF64(len); > error = platform_zero_range(fd, start_offset, len_bytes); > - if (!error) > + if (!error) { > + xfs_buftarg_trip_write(btp); > return 0; > + } > > zsize = min(BDSTRAT_SIZE, BBTOB(len)); > if ((z = memalign(libxfs_device_alignment(), zsize)) == NULL) { > @@ -105,6 +107,7 @@ libxfs_device_zero(struct xfs_buftarg *btp, xfs_daddr_t start, uint len) > progname, __FUNCTION__); > exit(1); > } > + xfs_buftarg_trip_write(btp); > offset += bytes; > } > free(z); > @@ -860,6 +863,7 @@ libxfs_bwrite( > } else { > bp->b_flags |= LIBXFS_B_UPTODATE; > bp->b_flags &= ~(LIBXFS_B_DIRTY | LIBXFS_B_UNCHECKED); > + xfs_buftarg_trip_write(bp->b_target); > } > return bp->b_error; > } >