On Tue, Apr 11, 2023 at 03:53:38PM -0700, Darrick J. Wong wrote: > From: Darrick J. Wong <djwong@xxxxxxxxxx> > > While fuzzing the data fork extent count on a btree-format directory > with xfs/375, I observed the following (excerpted) splat: > > XFS: Assertion failed: xfs_isilocked(ip, XFS_ILOCK_EXCL), file: fs/xfs/libxfs/xfs_bmap.c, line: 1208 > ------------[ cut here ]------------ > WARNING: CPU: 0 PID: 43192 at fs/xfs/xfs_message.c:104 assfail+0x46/0x4a [xfs] > Call Trace: > <TASK> > xfs_iread_extents+0x1af/0x210 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd] > xchk_dir_walk+0xb8/0x190 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd] > xchk_parent_count_parent_dentries+0x41/0x80 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd] > xchk_parent_validate+0x199/0x2e0 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd] > xchk_parent+0xdf/0x130 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd] > xfs_scrub_metadata+0x2b8/0x730 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd] > xfs_scrubv_metadata+0x38b/0x4d0 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd] > xfs_ioc_scrubv_metadata+0x111/0x160 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd] > xfs_file_ioctl+0x367/0xf50 [xfs 09f66509ece4938760fac7de64732a0cbd3e39cd] > __x64_sys_ioctl+0x82/0xa0 > do_syscall_64+0x2b/0x80 > entry_SYSCALL_64_after_hwframe+0x46/0xb0 > > The cause of this is a race condition in xfs_ilock_data_map_shared, > which performs an unlocked access to the data fork to guess which lock > mode it needs: > > Thread 0 Thread 1 > > xfs_need_iread_extents > <observe no iext tree> > xfs_ilock(..., ILOCK_EXCL) > xfs_iread_extents > <observe no iext tree> > <check ILOCK_EXCL> > <load bmbt extents into iext> > <notice iext size doesn't > match nextents> > xfs_need_iread_extents > <observe iext tree> > xfs_ilock(..., ILOCK_SHARED) > <tear down iext tree> > xfs_iunlock(..., ILOCK_EXCL) > xfs_iread_extents > <observe no iext tree> > <check ILOCK_EXCL> > *BOOM* > > Fix this race by adding a flag to the xfs_ifork structure to indicate > that we have not yet read in the extent records and changing the > predicate to look at the flag state, not if_height. The memory barrier > ensures that the flag will not be set until the very end of the > function. > > Signed-off-by: Darrick J. Wong <djwong@xxxxxxxxxx> > --- > Here's the userspace version, which steals heavily from the kernel but > otherwise uses liburcu underneath. > --- > include/atomic.h | 100 +++++++++++++++++++++++++++++++++++++++++++++++ > libxfs/libxfs_priv.h | 3 - > libxfs/xfs_bmap.c | 6 +++ > libxfs/xfs_inode_fork.c | 16 +++++++- > libxfs/xfs_inode_fork.h | 6 ++- > 5 files changed, 125 insertions(+), 6 deletions(-) > > diff --git a/include/atomic.h b/include/atomic.h > index 9c4aa5849ae..98889190bb0 100644 > --- a/include/atomic.h > +++ b/include/atomic.h > @@ -118,4 +118,104 @@ atomic64_set(atomic64_t *a, int64_t v) > > #endif /* HAVE_URCU_ATOMIC64 */ > > +#define __smp_mb() cmm_smp_mb() > + > +/* from compiler_types.h */ > +/* > + * __unqual_scalar_typeof(x) - Declare an unqualified scalar type, leaving > + * non-scalar types unchanged. > + */ > +/* > + * Prefer C11 _Generic for better compile-times and simpler code. Note: 'char' > + * is not type-compatible with 'signed char', and we define a separate case. > + */ > +#define __scalar_type_to_expr_cases(type) \ > + unsigned type: (unsigned type)0, \ > + signed type: (signed type)0 > + > +#define __unqual_scalar_typeof(x) typeof( \ > + _Generic((x), \ > + char: (char)0, \ > + __scalar_type_to_expr_cases(char), \ > + __scalar_type_to_expr_cases(short), \ > + __scalar_type_to_expr_cases(int), \ > + __scalar_type_to_expr_cases(long), \ > + __scalar_type_to_expr_cases(long long), \ > + default: (x))) > + > +/* Is this type a native word size -- useful for atomic operations */ > +#define __native_word(t) \ > + (sizeof(t) == sizeof(char) || sizeof(t) == sizeof(short) || \ > + sizeof(t) == sizeof(int) || sizeof(t) == sizeof(long)) > + > +#define compiletime_assert(foo, str) BUILD_BUG_ON(!(foo)) > + > +#define compiletime_assert_atomic_type(t) \ > + compiletime_assert(__native_word(t), \ > + "Need native word sized stores/loads for atomicity.") > + > +/* from barrier.h */ > +#ifndef __smp_store_release > +#define __smp_store_release(p, v) \ > +do { \ > + compiletime_assert_atomic_type(*p); \ > + __smp_mb(); \ > + WRITE_ONCE(*p, v); \ > +} while (0) > +#endif > + > +#ifndef __smp_load_acquire > +#define __smp_load_acquire(p) \ > +({ \ > + __unqual_scalar_typeof(*p) ___p1 = READ_ONCE(*p); \ > + compiletime_assert_atomic_type(*p); \ > + __smp_mb(); \ > + (typeof(*p))___p1; \ > +}) > +#endif > + > +#ifndef smp_store_release > +#define smp_store_release(p, v) __smp_store_release((p), (v)) > +#endif > + > +#ifndef smp_load_acquire > +#define smp_load_acquire(p) __smp_load_acquire(p) > +#endif > + > +/* from rwonce.h */ > +/* > + * Yes, this permits 64-bit accesses on 32-bit architectures. These will > + * actually be atomic in some cases (namely Armv7 + LPAE), but for others we > + * rely on the access being split into 2x32-bit accesses for a 32-bit quantity > + * (e.g. a virtual address) and a strong prevailing wind. > + */ > +#define compiletime_assert_rwonce_type(t) \ > + compiletime_assert(__native_word(t) || sizeof(t) == sizeof(long long), \ > + "Unsupported access size for {READ,WRITE}_ONCE().") > + > +/* > + * Use __READ_ONCE() instead of READ_ONCE() if you do not require any > + * atomicity. Note that this may result in tears! > + */ > +#ifndef __READ_ONCE > +#define __READ_ONCE(x) (*(const volatile __unqual_scalar_typeof(x) *)&(x)) > +#endif > + > +#define READ_ONCE(x) \ > +({ \ > + compiletime_assert_rwonce_type(x); \ > + __READ_ONCE(x); \ > +}) > + > +#define __WRITE_ONCE(x, val) \ > +do { \ > + *(volatile typeof(x) *)&(x) = (val); \ > +} while (0) > + > +#define WRITE_ONCE(x, val) \ > +do { \ > + compiletime_assert_rwonce_type(x); \ > + __WRITE_ONCE(x, val); \ > +} while (0) > + > #endif /* __ATOMIC_H__ */ I'd put READ/WRITE_ONCE above the barrier stuff as __smp_store_release/__smp_load_acquire use it, but other than that it looks OK. > diff --git a/libxfs/libxfs_priv.h b/libxfs/libxfs_priv.h > index e5f37df28c0..5fdfeeea060 100644 > --- a/libxfs/libxfs_priv.h > +++ b/libxfs/libxfs_priv.h > @@ -222,9 +222,6 @@ static inline bool WARN_ON(bool expr) { > #define percpu_counter_read_positive(x) ((*x) > 0 ? (*x) : 0) > #define percpu_counter_sum(x) (*x) > > -#define READ_ONCE(x) (x) > -#define WRITE_ONCE(x, val) ((x) = (val)) Yay! One less nasty hack for the userspace wrappers. Overall looks good to me. Reviewed-by: Dave Chinner <dchinner@xxxxxxxxxx> -- Dave Chinner david@xxxxxxxxxxxxx