On Wed, Aug 14, 2024 at 09:38:39PM +0100, David Howells wrote: > Improve the efficiency of buffered reads in a number of ways: > > (1) Overhaul the algorithm in general so that it's a lot more compact and > split the read submission code between buffered and unbuffered > versions. The unbuffered version can be vastly simplified. > > (2) Read-result collection is handed off to a work queue rather than being > done in the I/O thread. Multiple subrequests can be processed > simultaneously. > > (3) When a subrequest is collected, any folios it fully spans are > collected and "spare" data on either side is donated to either the > previous or the next subrequest in the sequence. > > Notes: > > (*) Readahead expansion massively slows down fio, presumably because it > causes a load of extra allocations, both folio and xarray, up front > before RPC requests can be transmitted. > > (*) RDMA with cifs does appear to work, both with SIW and RXE. > > (*) PG_private_2-based reading and copy-to-cache is split out into its own > file and altered to use folio_queue. Note that the copy to the cache > now creates a new write transaction against the cache and adds the > folios to be copied into it. This allows it to use part of the > writeback I/O code. > > Signed-off-by: David Howells <dhowells@xxxxxxxxxx> ... > diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c ... > @@ -334,9 +344,8 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) > struct ceph_client *cl = fsc->client; > struct ceph_osd_request *req = NULL; > struct ceph_vino vino = ceph_vino(inode); > - struct iov_iter iter; > - int err = 0; > - u64 len = subreq->len; > + int err; Hi David, err is set conditionally in various places in this function, and then read unconditionally near the end of this function. With this change it isn't entirely clear that err is always initialised by the end of the function. Flagged by Smatch. 
> + u64 len; > bool sparse = IS_ENCRYPTED(inode) || ceph_test_mount_opt(fsc, SPARSEREAD); > u64 off = subreq->start; > int extent_cnt; ... > @@ -410,17 +423,19 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq) > req->r_inode = inode; > ihold(inode); > > + trace_netfs_sreq(subreq, netfs_sreq_trace_submit); > ceph_osdc_start_request(req->r_osdc, req); > out: > ceph_osdc_put_request(req); > if (err) > - netfs_subreq_terminated(subreq, err, false); > + netfs_read_subreq_terminated(subreq, err, false); > doutc(cl, "%llx.%llx result %d\n", ceph_vinop(inode), err); > } ... > diff --git a/fs/netfs/read_retry.c b/fs/netfs/read_retry.c ... > +/* > + * Go through the list of failed/short reads, retrying all retryable ones. We > + * need to switch failed cache reads to network downloads. > + */ > +static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) > +{ > + struct netfs_io_subrequest *subreq; > + struct netfs_io_stream *stream0 = &rreq->io_streams[0]; > + LIST_HEAD(sublist); > + LIST_HEAD(queue); > + > + _enter("R=%x", rreq->debug_id); > + > + if (list_empty(&rreq->subrequests)) > + return; > + > + if (rreq->netfs_ops->retry_request) > + rreq->netfs_ops->retry_request(rreq, NULL); > + > + /* If there's no renegotiation to do, just resend each retryable subreq > + * up to the first permanently failed one. > + */ > + if (!rreq->netfs_ops->prepare_read && > + !test_bit(NETFS_RREQ_COPY_TO_CACHE, &rreq->flags)) { > + struct netfs_io_subrequest *subreq; > + > + list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { > + if (test_bit(NETFS_SREQ_FAILED, &subreq->flags)) > + break; > + if (__test_and_clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) { > + netfs_reset_iter(subreq); > + netfs_reissue_read(rreq, subreq); > + } > + } > + return; > + } > + > + /* Okay, we need to renegotiate all the download requests and flip any > + * failed cache reads over to being download requests and negotiate > + * those also. 
All fully successful subreqs have been removed from the > + * list and any spare data from those has been donated. > + * > + * What we do is decant the list and rebuild it one subreq at a time so > + * that we don't end up with donations jumping over a gap we're busy > + * populating with smaller subrequests. In the event that the subreq > + * we just launched finishes before we insert the next subreq, it'll > + * fill in rreq->prev_donated instead. > + > + * Note: Alternatively, we could split the tail subrequest right before > + * we reissue it and fix up the donations under lock. > + */ > + list_splice_init(&rreq->subrequests, &queue); > + > + do { > + struct netfs_io_subrequest *from; > + struct iov_iter source; > + unsigned long long start, len; > + size_t part, deferred_next_donated = 0; > + bool boundary = false; > + > + /* Go through the subreqs and find the next span of contiguous > + * buffer that we then rejig (cifs, for example, needs the > + * rsize renegotiating) and reissue. 
> + */ > + from = list_first_entry(&queue, struct netfs_io_subrequest, rreq_link); > + list_move_tail(&from->rreq_link, &sublist); > + start = from->start + from->transferred; > + len = from->len - from->transferred; > + > + _debug("from R=%08x[%x] s=%llx ctl=%zx/%zx/%zx", > + rreq->debug_id, from->debug_index, > + from->start, from->consumed, from->transferred, from->len); > + > + if (test_bit(NETFS_SREQ_FAILED, &from->flags) || > + !test_bit(NETFS_SREQ_NEED_RETRY, &from->flags)) > + goto abandon; > + > + deferred_next_donated = from->next_donated; > + while ((subreq = list_first_entry_or_null( > + &queue, struct netfs_io_subrequest, rreq_link))) { > + if (subreq->start != start + len || > + subreq->transferred > 0 || > + !test_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) > + break; > + list_move_tail(&subreq->rreq_link, &sublist); > + len += subreq->len; > + deferred_next_donated = subreq->next_donated; > + if (test_bit(NETFS_SREQ_BOUNDARY, &subreq->flags)) > + break; > + } > + > + _debug(" - range: %llx-%llx %llx", start, start + len - 1, len); > + > + /* Determine the set of buffers we're going to use. Each > + * subreq gets a subset of a single overall contiguous buffer. > + */ > + netfs_reset_iter(from); > + source = from->io_iter; > + source.count = len; > + > + /* Work through the sublist. 
*/ > + while ((subreq = list_first_entry_or_null( > + &sublist, struct netfs_io_subrequest, rreq_link))) { > + list_del(&subreq->rreq_link); > + > + subreq->source = NETFS_DOWNLOAD_FROM_SERVER; > + subreq->start = start - subreq->transferred; > + subreq->len = len + subreq->transferred; > + stream0->sreq_max_len = subreq->len; > + > + __clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); > + __set_bit(NETFS_SREQ_RETRYING, &subreq->flags); > + > + spin_lock_bh(&rreq->lock); > + list_add_tail(&subreq->rreq_link, &rreq->subrequests); > + subreq->prev_donated += rreq->prev_donated; > + rreq->prev_donated = 0; > + trace_netfs_sreq(subreq, netfs_sreq_trace_retry); > + spin_unlock_bh(&rreq->lock); > + > + BUG_ON(!len); > + > + /* Renegotiate max_len (rsize) */ > + if (rreq->netfs_ops->prepare_read(subreq) < 0) { Earlier in this function it is assumed that prepare_read may be NULL. Can that also be the case here? Also flagged by Smatch. > + trace_netfs_sreq(subreq, netfs_sreq_trace_reprep_failed); > + __set_bit(NETFS_SREQ_FAILED, &subreq->flags); > + } ...