Re: [PATCH] inode: remove __I_DIO_WAKEUP

Dave Chinner <david@xxxxxxxxxxxxx> · Sun, 18 Aug 2024 09:48:48 +1000

On Fri, Aug 16, 2024 at 04:35:52PM +0200, Christian Brauner wrote:
> Afaict, we can just rely on inode->i_dio_count for waiting instead of
> this awkward indirection through __I_DIO_WAKEUP. This survives LTP dio
> and xfstests dio tests.
> 
> Signed-off-by: Christian Brauner <brauner@xxxxxxxxxx>
> ---
> ---
>  fs/inode.c         | 23 +++++++++++------------
>  fs/netfs/locking.c | 18 +++---------------
>  include/linux/fs.h |  9 ++++-----
>  3 files changed, 18 insertions(+), 32 deletions(-)
> 
> diff --git a/fs/inode.c b/fs/inode.c
> index 7a4e27606fca..46bf05d826db 100644
> --- a/fs/inode.c
> +++ b/fs/inode.c
> @@ -2465,18 +2465,12 @@ EXPORT_SYMBOL(inode_owner_or_capable);
>  /*
>   * Direct i/o helper functions
>   */
> -static void __inode_dio_wait(struct inode *inode)
> +bool inode_dio_finished(const struct inode *inode)
>  {
> -	wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP);
> -	DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP);
> -
> -	do {
> -		prepare_to_wait(wq, &q.wq_entry, TASK_UNINTERRUPTIBLE);
> -		if (atomic_read(&inode->i_dio_count))
> -			schedule();
> -	} while (atomic_read(&inode->i_dio_count));
> -	finish_wait(wq, &q.wq_entry);
> +	smp_mb__before_atomic();
> +	return atomic_read(&inode->i_dio_count) == 0;
>  }
> +EXPORT_SYMBOL(inode_dio_finished);

What is the memory barrier here for?  i_dio_count is not a reference
count - there are no dependent inode object changes before/after it
changes that it needs to be ordered against, so I'm not sure what
the purpose of this memory barrier is supposed to be...

Memory barriers -always- need a comment describing the race
condition they are avoiding.

>  /**
>   * inode_dio_wait - wait for outstanding DIO requests to finish
> @@ -2490,11 +2484,16 @@ static void __inode_dio_wait(struct inode *inode)
>   */
>  void inode_dio_wait(struct inode *inode)
>  {
> -	if (atomic_read(&inode->i_dio_count))
> -		__inode_dio_wait(inode);
> +	wait_var_event(&inode->i_dio_count, inode_dio_finished);
>  }
>  EXPORT_SYMBOL(inode_dio_wait);
>  
> +void inode_dio_wait_interruptible(struct inode *inode)
> +{
> +	wait_var_event_interruptible(&inode->i_dio_count, inode_dio_finished);
> +}
> +EXPORT_SYMBOL(inode_dio_wait_interruptible);

Keep in mind that the prepare_to_wait() call inside
wait_var_event_interruptible() takes the waitqueue head lock before
checking the condition. This provides the necessary memory barriers
to prevent wait/wakeup races checking the inode_dio_finished()
state. Hence, AFAICT, no memory barriers are needed in
inode_dio_finished() at all....

>  /*
>   * inode_set_flags - atomically set some inode flags
>   *
> diff --git a/fs/netfs/locking.c b/fs/netfs/locking.c
> index 75dc52a49b3a..c2cfdda85230 100644
> --- a/fs/netfs/locking.c
> +++ b/fs/netfs/locking.c
> @@ -21,23 +21,11 @@
>   */
>  static int inode_dio_wait_interruptible(struct inode *inode)
>  {
> -	if (!atomic_read(&inode->i_dio_count))
> +	if (inode_dio_finished(inode))
>  		return 0;
>  
> -	wait_queue_head_t *wq = bit_waitqueue(&inode->i_state, __I_DIO_WAKEUP);
> -	DEFINE_WAIT_BIT(q, &inode->i_state, __I_DIO_WAKEUP);
> -
> -	for (;;) {
> -		prepare_to_wait(wq, &q.wq_entry, TASK_INTERRUPTIBLE);
> -		if (!atomic_read(&inode->i_dio_count))
> -			break;
> -		if (signal_pending(current))
> -			break;
> -		schedule();
> -	}
> -	finish_wait(wq, &q.wq_entry);
> -
> -	return atomic_read(&inode->i_dio_count) ? -ERESTARTSYS : 0;
> +	inode_dio_wait_interruptible(inode);
> +	return !inode_dio_finished(inode) ? -ERESTARTSYS : 0;

That looks broken. We have a private static function calling an
exported function of the same name. I suspect that this static
function needs to be named "netfs_dio_wait_interruptible()"....

> @@ -2413,8 +2411,6 @@ static inline void kiocb_clone(struct kiocb *kiocb, struct kiocb *kiocb_src,
>  #define __I_SYNC		7
>  #define I_SYNC			(1 << __I_SYNC)
>  #define I_REFERENCED		(1 << 8)
> -#define __I_DIO_WAKEUP		9
> -#define I_DIO_WAKEUP		(1 << __I_DIO_WAKEUP)
>  #define I_LINKABLE		(1 << 10)
>  #define I_DIRTY_TIME		(1 << 11)
>  #define I_WB_SWITCH		(1 << 13)
> @@ -3230,6 +3226,7 @@ static inline ssize_t blockdev_direct_IO(struct kiocb *iocb,
>  #endif
>  
>  void inode_dio_wait(struct inode *inode);
> +void inode_dio_wait_interruptible(struct inode *inode);
>  
>  /**
>   * inode_dio_begin - signal start of a direct I/O requests
> @@ -3241,6 +3238,7 @@ void inode_dio_wait(struct inode *inode);
>  static inline void inode_dio_begin(struct inode *inode)
>  {
>  	atomic_inc(&inode->i_dio_count);
> +	smp_mb__after_atomic();
>  }

Again I have no idea waht this barrier is doing. Why does this
operation need quasi-release semantics?

>  /**
> @@ -3252,8 +3250,9 @@ static inline void inode_dio_begin(struct inode *inode)
>   */
>  static inline void inode_dio_end(struct inode *inode)
>  {
> +	smp_mb__before_atomic();
>  	if (atomic_dec_and_test(&inode->i_dio_count))
> -		wake_up_bit(&inode->i_state, __I_DIO_WAKEUP);
> +		wake_up_var(&inode->i_dio_count);
>  }

atomic_dec_and_test() is a RMW atomic operation with a return value,
so has has fully ordered semanitcs according to
Documentation/atomic_t.txt:

	 - RMW operations that have a return value are fully ordered.
	[...]
	Fully ordered primitives are ordered against everything prior and everything
	subsequent. Therefore a fully ordered primitive is like having an smp_mb()
	before and an smp_mb() after the primitive.

So there's never a need for explicit barriers before/after an
atomic_dec_and_test() operation, right?

-Dave.
-- 
Dave Chinner
david@xxxxxxxxxxxxx