Re: [PATCH 02/11] xfs_scrub: improve reporting of file data media errors

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On Mon, Oct 21, 2019 at 11:33:59AM -0500, Eric Sandeen wrote:
> On 9/25/19 4:36 PM, Darrick J. Wong wrote:
> > From: Darrick J. Wong <darrick.wong@xxxxxxxxxx>
> > 
> > When we report media errors, we should tell the administrator the file
> > offset and length of the bad region, not just the offset of the entire
> > file extent record that overlaps a bad region.
> > 
> > Signed-off-by: Darrick J. Wong <darrick.wong@xxxxxxxxxx>
> > ---
> >  libfrog/bitmap.c |   37 +++++++++++++++++++++++++++++++++++++
> >  libfrog/bitmap.h |    2 ++
> >  scrub/phase6.c   |   52 +++++++++++++++++++++++++++++++++++++++++++++++-----
> >  3 files changed, 86 insertions(+), 5 deletions(-)
> > 
> > 
> > diff --git a/libfrog/bitmap.c b/libfrog/bitmap.c
> > index a75d085a..6a88ef48 100644
> > --- a/libfrog/bitmap.c
> > +++ b/libfrog/bitmap.c
> > @@ -339,6 +339,43 @@ bitmap_iterate(
> >  }
> >  #endif
> >  
> > +/* Iterate the set regions of part of this bitmap. */
> > +int
> > +bitmap_iterate_range(
> > +	struct bitmap		*bmap,
> > +	uint64_t		start,
> > +	uint64_t		length,
> > +	int			(*fn)(uint64_t, uint64_t, void *),
> > +	void			*arg)
> > +{
> > +	struct avl64node	*firstn;
> > +	struct avl64node	*lastn;
> > +	struct avl64node	*pos;
> > +	struct avl64node	*n;
> > +	struct avl64node	*l;
> > +	struct bitmap_node	*ext;
> > +	int			ret = 0;
> > +
> > +	pthread_mutex_lock(&bmap->bt_lock);
> > +
> > +	avl64_findranges(bmap->bt_tree, start, start + length, &firstn,
> > +			&lastn);
> > +
> > +	if (firstn == NULL && lastn == NULL)
> > +		goto out;
> > +
> > +	avl_for_each_range_safe(pos, n, l, firstn, lastn) {
> > +		ext = container_of(pos, struct bitmap_node, btn_node);
> > +		ret = fn(ext->btn_start, ext->btn_length, arg);
> > +		if (ret)
> > +			break;
> > +	}
> > +
> > +out:
> > +	pthread_mutex_unlock(&bmap->bt_lock);
> > +	return ret;
> > +}
> > +
> >  /* Do any bitmap extents overlap the given one?  (locked) */
> >  static bool
> >  __bitmap_test(
> > diff --git a/libfrog/bitmap.h b/libfrog/bitmap.h
> > index 759386a8..043b77ee 100644
> > --- a/libfrog/bitmap.h
> > +++ b/libfrog/bitmap.h
> > @@ -16,6 +16,8 @@ void bitmap_free(struct bitmap **bmap);
> >  int bitmap_set(struct bitmap *bmap, uint64_t start, uint64_t length);
> >  int bitmap_iterate(struct bitmap *bmap, int (*fn)(uint64_t, uint64_t, void *),
> >  		void *arg);
> > +int bitmap_iterate_range(struct bitmap *bmap, uint64_t start, uint64_t length,
> > +		int (*fn)(uint64_t, uint64_t, void *), void *arg);
> >  bool bitmap_test(struct bitmap *bmap, uint64_t start,
> >  		uint64_t len);
> >  bool bitmap_empty(struct bitmap *bmap);
> > diff --git a/scrub/phase6.c b/scrub/phase6.c
> > index 1edd98af..a16ad114 100644
> > --- a/scrub/phase6.c
> > +++ b/scrub/phase6.c
> > @@ -111,6 +111,41 @@ xfs_decode_special_owner(
> >  
> >  /* Routines to translate bad physical extents into file paths and offsets. */
> >  
> > +struct badfile_report {
> > +	struct scrub_ctx	*ctx;
> > +	const char		*descr;
> > +	struct xfs_bmap		*bmap;
> > +};
> > +
> > +/* Report on bad extents found during a media scan. */
> > +static int
> > +report_badfile(
> > +	uint64_t		start,
> > +	uint64_t		length,
> > +	void			*arg)
> > +{
> > +	struct badfile_report	*br = arg;
> > +	unsigned long long	bad_offset;
> > +	unsigned long long	bad_length;
> > +
> > +	/* Clamp the bad region to the file mapping. */
> > +	if (start < br->bmap->bm_physical) {
> > +		length -= br->bmap->bm_physical - start;
> > +		start = br->bmap->bm_physical;
> > +	}
> > +	length = min(length, br->bmap->bm_length);
> > +
> > +	/* Figure out how far into the bmap is the bad mapping and report it. */
> > +	bad_offset = start - br->bmap->bm_physical;
> > +	bad_length = min(start + length,
> > +			 br->bmap->bm_physical + br->bmap->bm_length) - start;
> > +
> > +	str_error(br->ctx, br->descr,
> > +_("media error at data offset %llu length %llu."),
> > +			br->bmap->bm_offset + bad_offset, bad_length);
> > +	return 0;
> > +}
> > +
> >  /* Report if this extent overlaps a bad region. */
> >  static bool
> >  report_data_loss(
> > @@ -122,8 +157,14 @@ report_data_loss(
> >  	struct xfs_bmap			*bmap,
> >  	void				*arg)
> >  {
> > +	struct badfile_report		br = {
> > +		.ctx			= ctx,
> > +		.descr			= descr,
> > +		.bmap			= bmap,
> > +	};
> >  	struct media_verify_state	*vs = arg;
> >  	struct bitmap			*bmp;
> > +	int				ret;
> >  
> >  	/* Only report errors for real extents. */
> >  	if (bmap->bm_flags & (BMV_OF_PREALLOC | BMV_OF_DELALLOC))
> > @@ -134,11 +175,12 @@ report_data_loss(
> >  	else
> >  		bmp = vs->d_bad;
> >  
> > -	if (!bitmap_test(bmp, bmap->bm_physical, bmap->bm_length))
> > -		return true;
> > -
> > -	str_error(ctx, descr,
> > -_("offset %llu failed read verification."), bmap->bm_offset);
> > +	ret = bitmap_iterate_range(bmp, bmap->bm_physical, bmap->bm_length,
> > +			report_badfile, &br);
> > +	if (ret) {
> > +		str_liberror(ctx, ret, descr);
> > +		return false;
> 
> So related to the prior question; why does changing the way we report a
> media error in a file change the flow with a "return false" here?

The bitmap iteration itself failed (since report_badfile never returns
false), which means it hit a runtime error and the program should exit.

(Granted, bitmap iteration never fails, so this is merely defensive
programming.)

--D

> > +	}
> >  	return true;
> >  }
> >  
> > 



[Index of Archives]     [XFS Filesystem Development (older mail)]     [Linux Filesystem Development]     [Linux Audio Users]     [Yosemite Trails]     [Linux Kernel]     [Linux RAID]     [Linux SCSI]


  Powered by Linux