On Wed, Jul 03, 2019 at 10:01:37AM -0700, Dan Williams wrote:
> On Wed, Jul 3, 2019 at 5:17 AM Matthew Wilcox <willy@xxxxxxxxxxxxx> wrote:
> >
> > On Wed, Jul 03, 2019 at 12:24:54AM -0700, Dan Williams wrote:
> > > This fix may increase waitqueue contention, but a fix for that is saved
> > > for a larger rework. In the meantime this fix is suitable for -stable
> > > backports.
> >
> > I think this is too big for what it is; just the two-line patch to stop
> > incorporating the low bits of the PTE would be more appropriate.
>
> Sufficient, yes, "appropriate", not so sure. All those comments about
> pmd entry size are stale after this change.

But then they'll have to be put back in again.

This seems to be working for me, although I doubt I'm actually hitting
the edge case that rocksdb hits:

diff --git a/fs/dax.c b/fs/dax.c
index 2e48c7ebb973..e77bd6aef10c 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -198,6 +198,10 @@ static void dax_wake_entry(struct xa_state *xas, void *entry, bool wake_all)
  * if it did.
  *
  * Must be called with the i_pages lock held.
+ *
+ * If the xa_state refers to a larger entry, then it may return a locked
+ * smaller entry (eg a PTE entry) without waiting for the smaller entry
+ * to be unlocked.
  */
 static void *get_unlocked_entry(struct xa_state *xas)
 {
@@ -211,7 +215,8 @@ static void *get_unlocked_entry(struct xa_state *xas)
 	for (;;) {
 		entry = xas_find_conflict(xas);
 		if (!entry || WARN_ON_ONCE(!xa_is_value(entry)) ||
-				!dax_is_locked(entry))
+				!dax_is_locked(entry) ||
+				dax_entry_order(entry) < xas_get_order(xas))
 			return entry;
 
 		wq = dax_entry_waitqueue(xas, entry, &ewait.key);
@@ -253,8 +258,12 @@ static void wait_entry_unlocked(struct xa_state *xas, void *entry)
 
 static void put_unlocked_entry(struct xa_state *xas, void *entry)
 {
-	/* If we were the only waiter woken, wake the next one */
-	if (entry)
+	/*
+	 * If we were the only waiter woken, wake the next one.
+	 * Do not wake anybody if the entry is locked; that indicates
+	 * we weren't woken.
+	 */
+	if (entry && !dax_is_locked(entry))
 		dax_wake_entry(xas, entry, false);
 }
 
diff --git a/include/linux/xarray.h b/include/linux/xarray.h
index 052e06ff4c36..b17289d92af4 100644
--- a/include/linux/xarray.h
+++ b/include/linux/xarray.h
@@ -1529,6 +1529,27 @@ static inline void xas_set_order(struct xa_state *xas, unsigned long index,
 #endif
 }
 
+/**
+ * xas_get_order() - Get the order of the entry being operated on.
+ * @xas: XArray operation state.
+ *
+ * Return: The order of the entry.
+ */
+static inline unsigned int xas_get_order(const struct xa_state *xas)
+{
+	unsigned int order = xas->xa_shift;
+
+#ifdef CONFIG_XARRAY_MULTI
+	unsigned int sibs = xas->xa_sibs;
+
+	while (sibs) {
+		order++;
+		sibs /= 2;
+	}
+#endif
+	return order;
+}
+
 /**
  * xas_set_update() - Set up XArray operation state for a callback.
  * @xas: XArray operation state.
diff --git a/lib/test_xarray.c b/lib/test_xarray.c
index 7df4f7f395bf..af024477ec93 100644
--- a/lib/test_xarray.c
+++ b/lib/test_xarray.c
@@ -95,6 +95,17 @@ static noinline void check_xa_err(struct xarray *xa)
 //	XA_BUG_ON(xa, xa_err(xa_store(xa, 0, xa_mk_internal(0), 0)) != -EINVAL);
 }
 
+static noinline void check_xas_order(struct xarray *xa)
+{
+	XA_STATE(xas, xa, 0);
+	unsigned int i;
+
+	for (i = 0; i < sizeof(long) * 8; i++) {
+		xas_set_order(&xas, 0, i);
+		XA_BUG_ON(xa, xas_get_order(&xas) != i);
+	}
+}
+
 static noinline void check_xas_retry(struct xarray *xa)
 {
 	XA_STATE(xas, xa, 0);
@@ -1583,6 +1594,7 @@ static DEFINE_XARRAY(array);
 static int xarray_checks(void)
 {
 	check_xa_err(&array);
+	check_xas_order(&array);
 	check_xas_retry(&array);
 	check_xa_load(&array);
 	check_xa_mark(&array);
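
For concreteness on the get_unlocked_entry() change: with 4kB pages, a
2MB PMD entry has order 9, so a lookup set up at PMD order that finds a
locked order-0 PTE entry now sees dax_entry_order(entry) < xas_get_order(xas)
and returns the locked smaller entry immediately instead of sleeping,
exactly as the new kerneldoc comment describes.

The while loop in xas_get_order() works because xas_set_order() splits
an order into xa_shift (rounded down to a multiple of XA_CHUNK_SHIFT)
and xa_sibs = 2^k - 1, so k halvings take sibs to zero (eg sibs == 3
goes 3 -> 1 -> 0, adding 2 back onto the order).  Below is a minimal
user-space sketch of that round trip, mirroring what check_xas_order()
verifies in-kernel; the fake_xas struct and fake_* helpers are invented
for illustration, and XA_CHUNK_SHIFT is assumed to be the usual 6
(64 slots per node):

#include <assert.h>
#include <stdio.h>

#define XA_CHUNK_SHIFT	6	/* 64 slots per node, the common config */

/* Stand-in for the two xa_state fields the order encoding lives in. */
struct fake_xas {
	unsigned char shift;	/* multiple of XA_CHUNK_SHIFT */
	unsigned char sibs;	/* 2^k - 1 sibling slots at that level */
};

/* Mirrors the shift/sibs encoding xas_set_order() uses. */
static void fake_set_order(struct fake_xas *xas, unsigned int order)
{
	xas->shift = order - (order % XA_CHUNK_SHIFT);
	xas->sibs = (1U << (order % XA_CHUNK_SHIFT)) - 1;
}

/* Same recovery loop as the proposed xas_get_order(). */
static unsigned int fake_get_order(const struct fake_xas *xas)
{
	unsigned int order = xas->shift;
	unsigned int sibs = xas->sibs;

	while (sibs) {		/* sibs == 2^k - 1, so exactly k iterations */
		order++;
		sibs /= 2;
	}
	return order;
}

int main(void)
{
	unsigned int order;

	/* Same bounds as the check_xas_order() loop in the patch. */
	for (order = 0; order < sizeof(long) * 8; order++) {
		struct fake_xas xas;

		fake_set_order(&xas, order);
		assert(fake_get_order(&xas) == order);
	}
	printf("order round-trip OK\n");
	return 0;
}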