On Thu, 2025-03-13 at 23:33 +0000, David Howells wrote: > Add some support for RMW in ceph: > > (1) Add netfs_unbuffered_read_from_inode() to allow reading from an inode > without having a file pointer so that truncate can modify a > now-partial tail block of a content-encrypted file. > > This takes an additional argument to cause it to fail or give a short > read if a hole is encountered. This is noted on the request with > NETFS_RREQ_NO_READ_HOLE for the filesystem to pick up. > > (2) Set NETFS_RREQ_RMW when doing an RMW as part of a request. > > (3) Provide a ->rmw_read_done() op for netfslib to tell the filesystem > that it has completed the read required for RMW. > > Signed-off-by: David Howells <dhowells@xxxxxxxxxx> > cc: Jeff Layton <jlayton@xxxxxxxxxx> > cc: Viacheslav Dubeyko <slava@xxxxxxxxxxx> > cc: Alex Markuze <amarkuze@xxxxxxxxxx> > cc: Ilya Dryomov <idryomov@xxxxxxxxx> > cc: ceph-devel@xxxxxxxxxxxxxxx > cc: linux-fsdevel@xxxxxxxxxxxxxxx > --- > fs/netfs/direct_read.c | 75 ++++++++++++++++++++++++++++++++++++ > fs/netfs/direct_write.c | 1 + > fs/netfs/main.c | 1 + > fs/netfs/objects.c | 1 + > fs/netfs/read_collect.c | 2 + > fs/netfs/write_retry.c | 3 ++ > include/linux/netfs.h | 7 ++++ > include/trace/events/netfs.h | 3 ++ > 8 files changed, 93 insertions(+) > > diff --git a/fs/netfs/direct_read.c b/fs/netfs/direct_read.c > index 5e4bd1e5a378..4061f934dfe6 100644 > --- a/fs/netfs/direct_read.c > +++ b/fs/netfs/direct_read.c > @@ -373,3 +373,78 @@ ssize_t netfs_unbuffered_read_iter(struct kiocb *iocb, struct iov_iter *iter) > return ret; > } > EXPORT_SYMBOL(netfs_unbuffered_read_iter); > + > +/** > + * netfs_unbuffered_read_from_inode - Perform an unbuffered sync I/O read > + * @inode: The inode being accessed > + * @pos: The file position to read from > + * @iter: The output buffer (also specifies read length) > + * @nohole: True to return short/ENODATA if hole encountered > + * > + * Perform a synchronous unbuffered I/O from the inode to the output 
buffer. > + * No use is made of the pagecache. The output buffer must be suitably aligned > + * if content encryption is to be used. If @nohole is true then the read will > + * stop short if a hole is encountered and return -ENODATA if the read begins > + * with a hole. > + * > + * The caller must hold any appropriate locks. > + */ > +ssize_t netfs_unbuffered_read_from_inode(struct inode *inode, loff_t pos, > + struct iov_iter *iter, bool nohole) > +{ > + struct netfs_io_request *rreq; > + ssize_t ret; > + size_t orig_count = iov_iter_count(iter); > + > + _enter(""); > + > + if (WARN_ON(user_backed_iter(iter))) > + return -EIO; > + > + if (!orig_count) > + return 0; /* Don't update atime */ > + > + ret = filemap_write_and_wait_range(inode->i_mapping, pos, orig_count); > + if (ret < 0) > + return ret; > + inode_update_time(inode, S_ATIME); > + > + rreq = netfs_alloc_request(inode->i_mapping, NULL, pos, orig_count, > + NULL, NETFS_UNBUFFERED_READ); > + if (IS_ERR(rreq)) > + return PTR_ERR(rreq); > + > + ret = -EIO; > + if (test_bit(NETFS_RREQ_CONTENT_ENCRYPTION, &rreq->flags) && > + WARN_ON(!netfs_is_crypto_aligned(rreq, iter))) > + goto out; > + > + netfs_stat(&netfs_n_rh_dio_read); > + trace_netfs_read(rreq, rreq->start, rreq->len, > + netfs_read_trace_unbuffered_read_from_inode); > + > + rreq->buffer.iter = *iter; The struct iov_iter structure is fairly complex, and here it is assigned by value into rreq->buffer.iter, so the copy is independent of the caller's iterator: any changes that netfslib makes to rreq->buffer.iter will not be reflected back through the original @iter pointer. Is that the intended behavior here? Thanks, Slava. > + rreq->len = orig_count; > + rreq->direct_bv_unpin = false; > + iov_iter_advance(iter, orig_count); > + > + if (nohole) > + __set_bit(NETFS_RREQ_NO_READ_HOLE, &rreq->flags); > + > + /* We're going to do the crypto in place in the destination buffer. 
*/ > + if (test_bit(NETFS_RREQ_CONTENT_ENCRYPTION, &rreq->flags)) > + __set_bit(NETFS_RREQ_CRYPT_IN_PLACE, &rreq->flags); > + > + ret = netfs_dispatch_unbuffered_reads(rreq); > + > + if (!rreq->submitted) { > + netfs_put_request(rreq, false, netfs_rreq_trace_put_no_submit); > + goto out; > + } > + > + ret = netfs_wait_for_read(rreq); > +out: > + netfs_put_request(rreq, false, netfs_rreq_trace_put_return); > + return ret; > +} > +EXPORT_SYMBOL(netfs_unbuffered_read_from_inode); > diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c > index 83c5c06c4710..a99722f90c71 100644 > --- a/fs/netfs/direct_write.c > +++ b/fs/netfs/direct_write.c > @@ -145,6 +145,7 @@ static ssize_t netfs_write_through_bounce_buffer(struct netfs_io_request *wreq, > wreq->start = gstart; > wreq->len = gend - gstart; > > + __set_bit(NETFS_RREQ_RMW, &ictx->flags); > if (gstart >= end) { > /* At or after EOF, nothing to read. */ > } else { > diff --git a/fs/netfs/main.c b/fs/netfs/main.c > index 07f8cffbda8c..0900dea53e4a 100644 > --- a/fs/netfs/main.c > +++ b/fs/netfs/main.c > @@ -39,6 +39,7 @@ static const char *netfs_origins[nr__netfs_io_origin] = { > [NETFS_READ_GAPS] = "RG", > [NETFS_READ_SINGLE] = "R1", > [NETFS_READ_FOR_WRITE] = "RW", > + [NETFS_UNBUFFERED_READ] = "UR", > [NETFS_DIO_READ] = "DR", > [NETFS_WRITEBACK] = "WB", > [NETFS_WRITEBACK_SINGLE] = "W1", > diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c > index 4606e830c116..958c4d460d07 100644 > --- a/fs/netfs/objects.c > +++ b/fs/netfs/objects.c > @@ -60,6 +60,7 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, > origin == NETFS_READ_GAPS || > origin == NETFS_READ_SINGLE || > origin == NETFS_READ_FOR_WRITE || > + origin == NETFS_UNBUFFERED_READ || > origin == NETFS_DIO_READ) { > INIT_WORK(&rreq->work, netfs_read_collection_worker); > rreq->io_streams[0].avail = true; > diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c > index 0a0bff90ca9e..013a90738dcd 100644 > --- 
a/fs/netfs/read_collect.c > +++ b/fs/netfs/read_collect.c > @@ -462,6 +462,7 @@ static void netfs_read_collection(struct netfs_io_request *rreq) > //netfs_rreq_is_still_valid(rreq); > > switch (rreq->origin) { > + case NETFS_UNBUFFERED_READ: > case NETFS_DIO_READ: > case NETFS_READ_GAPS: > case NETFS_RMW_READ: > @@ -681,6 +682,7 @@ ssize_t netfs_wait_for_read(struct netfs_io_request *rreq) > if (ret == 0) { > ret = rreq->transferred; > switch (rreq->origin) { > + case NETFS_UNBUFFERED_READ: > case NETFS_DIO_READ: > case NETFS_READ_SINGLE: > ret = rreq->transferred; > diff --git a/fs/netfs/write_retry.c b/fs/netfs/write_retry.c > index f727b48e2bfe..9e4e79d5a403 100644 > --- a/fs/netfs/write_retry.c > +++ b/fs/netfs/write_retry.c > @@ -386,6 +386,9 @@ ssize_t netfs_rmw_read(struct netfs_io_request *wreq, struct file *file, > ret = 0; > } > > + if (ret == 0 && rreq->netfs_ops->rmw_read_done) > + rreq->netfs_ops->rmw_read_done(wreq, rreq); > + > error: > netfs_put_request(rreq, false, netfs_rreq_trace_put_return); > return ret; > diff --git a/include/linux/netfs.h b/include/linux/netfs.h > index 9d17d4bd9753..4049c985b9b4 100644 > --- a/include/linux/netfs.h > +++ b/include/linux/netfs.h > @@ -220,6 +220,7 @@ enum netfs_io_origin { > NETFS_READ_GAPS, /* This read is a synchronous read to fill gaps */ > NETFS_READ_SINGLE, /* This read should be treated as a single object */ > NETFS_READ_FOR_WRITE, /* This read is to prepare a write */ > + NETFS_UNBUFFERED_READ, /* This is an unbuffered I/O read */ > NETFS_DIO_READ, /* This is a direct I/O read */ > NETFS_WRITEBACK, /* This write was triggered by writepages */ > NETFS_WRITEBACK_SINGLE, /* This monolithic write was triggered by writepages */ > @@ -308,6 +309,9 @@ struct netfs_io_request { > #define NETFS_RREQ_CONTENT_ENCRYPTION 16 /* Content encryption is in use */ > #define NETFS_RREQ_CRYPT_IN_PLACE 17 /* Do decryption in place */ > #define NETFS_RREQ_PUT_RMW_TAIL 18 /* Need to put ->rmw_tail */ > +#define 
NETFS_RREQ_RMW 19 /* Performing RMW cycle */ > +#define NETFS_RREQ_REPEAT_RMW 20 /* Need to perform an RMW cycle */ > +#define NETFS_RREQ_NO_READ_HOLE 21 /* Give short read/error if hole encountered */ > #define NETFS_RREQ_USE_PGPRIV2 31 /* [DEPRECATED] Use PG_private_2 to mark > * write to cache on read */ > const struct netfs_request_ops *netfs_ops; > @@ -336,6 +340,7 @@ struct netfs_request_ops { > /* Modification handling */ > void (*update_i_size)(struct inode *inode, loff_t i_size); > void (*post_modify)(struct inode *inode, void *fs_priv); > + void (*rmw_read_done)(struct netfs_io_request *wreq, struct netfs_io_request *rreq); > > /* Write request handling */ > void (*begin_writeback)(struct netfs_io_request *wreq); > @@ -432,6 +437,8 @@ ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *i > ssize_t netfs_unbuffered_read_iter(struct kiocb *iocb, struct iov_iter *iter); > ssize_t netfs_buffered_read_iter(struct kiocb *iocb, struct iov_iter *iter); > ssize_t netfs_file_read_iter(struct kiocb *iocb, struct iov_iter *iter); > +ssize_t netfs_unbuffered_read_from_inode(struct inode *inode, loff_t pos, > + struct iov_iter *iter, bool nohole); > > /* High-level write API */ > ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, > diff --git a/include/trace/events/netfs.h b/include/trace/events/netfs.h > index 74af82d773bd..9254c6f0e604 100644 > --- a/include/trace/events/netfs.h > +++ b/include/trace/events/netfs.h > @@ -23,6 +23,7 @@ > EM(netfs_read_trace_read_gaps, "READ-GAPS") \ > EM(netfs_read_trace_read_single, "READ-SNGL") \ > EM(netfs_read_trace_prefetch_for_write, "PREFETCHW") \ > + EM(netfs_read_trace_unbuffered_read_from_inode, "READ-INOD") \ > E_(netfs_read_trace_write_begin, "WRITEBEGN") > > #define netfs_write_traces \ > @@ -38,6 +39,7 @@ > EM(NETFS_READ_GAPS, "RG") \ > EM(NETFS_READ_SINGLE, "R1") \ > EM(NETFS_READ_FOR_WRITE, "RW") \ > + EM(NETFS_UNBUFFERED_READ, "UR") \ > EM(NETFS_DIO_READ, "DR") \ > 
EM(NETFS_WRITEBACK, "WB") \ > EM(NETFS_WRITEBACK_SINGLE, "W1") \ > @@ -104,6 +106,7 @@ > EM(netfs_sreq_trace_io_progress, "IO ") \ > EM(netfs_sreq_trace_limited, "LIMIT") \ > EM(netfs_sreq_trace_need_clear, "N-CLR") \ > + EM(netfs_sreq_trace_need_rmw, "N-RMW") \ > EM(netfs_sreq_trace_partial_read, "PARTR") \ > EM(netfs_sreq_trace_need_retry, "ND-RT") \ > EM(netfs_sreq_trace_pending, "PEND ") \ > >