> On Feb 20, 2020, at 1:28 PM, Anna Schumaker <schumaker.anna@xxxxxxxxx> wrote: > > On Thu, 2020-02-20 at 09:55 -0500, Chuck Lever wrote: >>> On Feb 20, 2020, at 9:42 AM, Anna Schumaker <schumaker.anna@xxxxxxxxx> >>> wrote: >>> >>> On Fri, 2020-02-14 at 17:28 -0500, Chuck Lever wrote: >>>>> On Feb 14, 2020, at 4:12 PM, schumaker.anna@xxxxxxxxx wrote: >>>>> >>>>> From: Anna Schumaker <Anna.Schumaker@xxxxxxxxxx> >>>>> >>>>> This patch adds client support for decoding a single NFS4_CONTENT_DATA >>>>> segment returned by the server. This is the simplest implementation >>>>> possible, since it does not account for any hole segments in the reply. >>>>> >>>>> Signed-off-by: Anna Schumaker <Anna.Schumaker@xxxxxxxxxx> >>>>> --- >>>>> fs/nfs/nfs42xdr.c | 138 ++++++++++++++++++++++++++++++++++++++ >>>>> fs/nfs/nfs4proc.c | 43 +++++++++++- >>>>> fs/nfs/nfs4xdr.c | 1 + >>>>> include/linux/nfs4.h | 2 +- >>>>> include/linux/nfs_fs_sb.h | 1 + >>>>> include/linux/nfs_xdr.h | 2 +- >>>>> 6 files changed, 182 insertions(+), 5 deletions(-) >>>>> >>>>> diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c >>>>> index c03f3246d6c5..bf118ecabe2c 100644 >>>>> --- a/fs/nfs/nfs42xdr.c >>>>> +++ b/fs/nfs/nfs42xdr.c >>>>> @@ -45,6 +45,15 @@ >>>>> #define encode_deallocate_maxsz (op_encode_hdr_maxsz + \ >>>>> encode_fallocate_maxsz) >>>>> #define decode_deallocate_maxsz (op_decode_hdr_maxsz) >>>>> +#define encode_read_plus_maxsz (op_encode_hdr_maxsz + \ >>>>> + encode_stateid_maxsz + 3) >>>>> +#define NFS42_READ_PLUS_SEGMENT_SIZE (1 /* data_content4 */ + \ >>>>> + 2 /* data_info4.di_offset */ + >>>>> \ >>>>> + 2 /* data_info4.di_length */) >>>>> +#define decode_read_plus_maxsz (op_decode_hdr_maxsz + \ >>>>> + 1 /* rpr_eof */ + \ >>>>> + 1 /* rpr_contents count */ + \ >>>>> + NFS42_READ_PLUS_SEGMENT_SIZE) >>>>> #define encode_seek_maxsz (op_encode_hdr_maxsz + \ >>>>> encode_stateid_maxsz + \ >>>>> 2 /* offset */ + \ >>>>> @@ -128,6 +137,14 @@ >>>>> decode_putfh_maxsz + \ >>>>> 
decode_deallocate_maxsz + \ >>>>> decode_getattr_maxsz) >>>>> +#define NFS4_enc_read_plus_sz (compound_encode_hdr_maxsz + \ >>>>> + encode_sequence_maxsz + \ >>>>> + encode_putfh_maxsz + \ >>>>> + encode_read_plus_maxsz) >>>>> +#define NFS4_dec_read_plus_sz (compound_decode_hdr_maxsz + \ >>>>> + decode_sequence_maxsz + \ >>>>> + decode_putfh_maxsz + \ >>>>> + decode_read_plus_maxsz) >>>>> #define NFS4_enc_seek_sz (compound_encode_hdr_maxsz + \ >>>>> encode_sequence_maxsz + \ >>>>> encode_putfh_maxsz + \ >>>>> @@ -252,6 +269,16 @@ static void encode_deallocate(struct xdr_stream >>>>> *xdr, >>>>> encode_fallocate(xdr, args); >>>>> } >>>>> >>>>> +static void encode_read_plus(struct xdr_stream *xdr, >>>>> + const struct nfs_pgio_args *args, >>>>> + struct compound_hdr *hdr) >>>>> +{ >>>>> + encode_op_hdr(xdr, OP_READ_PLUS, decode_read_plus_maxsz, hdr); >>>>> + encode_nfs4_stateid(xdr, &args->stateid); >>>>> + encode_uint64(xdr, args->offset); >>>>> + encode_uint32(xdr, args->count); >>>>> +} >>>>> + >>>>> static void encode_seek(struct xdr_stream *xdr, >>>>> const struct nfs42_seek_args *args, >>>>> struct compound_hdr *hdr) >>>>> @@ -446,6 +473,29 @@ static void nfs4_xdr_enc_deallocate(struct rpc_rqst >>>>> *req, >>>>> encode_nops(&hdr); >>>>> } >>>>> >>>>> +/* >>>>> + * Encode READ_PLUS request >>>>> + */ >>>>> +static void nfs4_xdr_enc_read_plus(struct rpc_rqst *req, >>>>> + struct xdr_stream *xdr, >>>>> + const void *data) >>>>> +{ >>>>> + const struct nfs_pgio_args *args = data; >>>>> + struct compound_hdr hdr = { >>>>> + .minorversion = nfs4_xdr_minorversion(&args->seq_args), >>>>> + }; >>>>> + >>>>> + encode_compound_hdr(xdr, req, &hdr); >>>>> + encode_sequence(xdr, &args->seq_args, &hdr); >>>>> + encode_putfh(xdr, args->fh, &hdr); >>>>> + encode_read_plus(xdr, args, &hdr); >>>>> + >>>>> + rpc_prepare_reply_pages(req, args->pages, args->pgbase, >>>>> + args->count, hdr.replen); >>>>> + req->rq_rcv_buf.flags |= XDRBUF_READ; >>>> >>>> IMO this line is incorrect. 
>>>
>>> You're right, this line causes problems for RDMA with READ_PLUS. I added it
>>> to match how the other xdr read encoders were set up
>>
>> Ja, I think just removing that line should be sufficient.
>> Better would be replacing it with a comment explaining
>> why this encoder does not set XDRBUF_READ. :-)
>>
>>
>>>> RFC 8267 Section 6.1 does not list any part of the result of READ_PLUS
>>>> as DDP-eligible. There's no way for a client to know how to set up
>>>> Write chunks, unless it knows exactly where the file's holes are in
>>>> advance. Even then... racy.
>>>>
>>>> Just curious, have you tried READ_PLUS with proto=rdma ?
>>>
>>> I haven't done in-depth performance testing, but I have been able to run it.
>>
>> We should figure out whether that will have a regressive
>> impact on NFS/RDMA workloads. I expect that it will, but
>> the client can always set up the Reply chunk so that the
>> READ payload fits precisely in an RDMA segment that lines
>> up with page cache pages. That mitigates some impact.
>>
>> If your patch set already changes NFSv4.2 mounts to always
>> use READ_PLUS in place of READ, it might be prudent for the
>> "proto=rdma" mount option to also set "noreadplus", at least
>> for the time being.
>
> I can make this change.
>
>>
>> The down-side here is that would make NFSv4.2 on RDMA
>> unable to recognize holes in files the same way as it
>> does on TCP, and that's a pretty significant variation
>> in behavior. Does "noreadplus" even deal with that?
>
> Setting "noreadplus" just causes the client to use the READ operation instead,
> so there should be no difference between v4.1 and v4.2 if the option is set.

My concern is the difference between NFSv4.2 with noreadplus and
NFSv4.2 with readplus. The former is not able to detect holes in
files on the server, but the latter is. Is that worth mentioning
in the man page, or in release notes when NFSv4.2 becomes the
default?
> Anna > >> >> >>> Anna >>> >>>> >>>>> + encode_nops(&hdr); >>>>> +} >>>>> + >>>>> /* >>>>> * Encode SEEK request >>>>> */ >>>>> @@ -694,6 +744,67 @@ static int decode_deallocate(struct xdr_stream >>>>> *xdr, >>>>> struct nfs42_falloc_res *re >>>>> return decode_op_hdr(xdr, OP_DEALLOCATE); >>>>> } >>>>> >>>>> +static uint32_t decode_read_plus_data(struct xdr_stream *xdr, struct >>>>> nfs_pgio_res *res, >>>>> + uint32_t *eof) >>>>> +{ >>>>> + __be32 *p; >>>>> + uint32_t count, recvd; >>>>> + uint64_t offset; >>>>> + >>>>> + p = xdr_inline_decode(xdr, 8 + 4); >>>>> + if (unlikely(!p)) >>>>> + return -EIO; >>>>> + >>>>> + p = xdr_decode_hyper(p, &offset); >>>>> + count = be32_to_cpup(p); >>>>> + if (count == 0) >>>>> + return 0; >>>>> + >>>>> + recvd = xdr_read_pages(xdr, count); >>>>> + if (count > recvd) { >>>>> + dprintk("NFS: server cheating in read reply: " >>>>> + "count %u > recvd %u\n", count, recvd); >>>>> + count = recvd; >>>>> + *eof = 0; >>>>> + } >>>>> + >>>>> + return count; >>>>> +} >>>>> + >>>>> +static int decode_read_plus(struct xdr_stream *xdr, struct nfs_pgio_res >>>>> *res) >>>>> +{ >>>>> + __be32 *p; >>>>> + uint32_t count, eof, segments, type; >>>>> + int status; >>>>> + >>>>> + status = decode_op_hdr(xdr, OP_READ_PLUS); >>>>> + if (status) >>>>> + return status; >>>>> + >>>>> + p = xdr_inline_decode(xdr, 4 + 4); >>>>> + if (unlikely(!p)) >>>>> + return -EIO; >>>>> + >>>>> + eof = be32_to_cpup(p++); >>>>> + segments = be32_to_cpup(p++); >>>>> + if (segments == 0) >>>>> + return 0; >>>>> + >>>>> + p = xdr_inline_decode(xdr, 4); >>>>> + if (unlikely(!p)) >>>>> + return -EIO; >>>>> + >>>>> + type = be32_to_cpup(p++); >>>>> + if (type == NFS4_CONTENT_DATA) >>>>> + count = decode_read_plus_data(xdr, res, &eof); >>>>> + else >>>>> + return -EINVAL; >>>>> + >>>>> + res->eof = eof; >>>>> + res->count = count; >>>>> + return 0; >>>>> +} >>>>> + >>>>> static int decode_seek(struct xdr_stream *xdr, struct nfs42_seek_res >>>>> *res) >>>>> { >>>>> int 
status; >>>>> @@ -870,6 +981,33 @@ static int nfs4_xdr_dec_deallocate(struct rpc_rqst >>>>> *rqstp, >>>>> return status; >>>>> } >>>>> >>>>> +/* >>>>> + * Decode READ_PLUS request >>>>> + */ >>>>> +static int nfs4_xdr_dec_read_plus(struct rpc_rqst *rqstp, >>>>> + struct xdr_stream *xdr, >>>>> + void *data) >>>>> +{ >>>>> + struct nfs_pgio_res *res = data; >>>>> + struct compound_hdr hdr; >>>>> + int status; >>>>> + >>>>> + status = decode_compound_hdr(xdr, &hdr); >>>>> + if (status) >>>>> + goto out; >>>>> + status = decode_sequence(xdr, &res->seq_res, rqstp); >>>>> + if (status) >>>>> + goto out; >>>>> + status = decode_putfh(xdr); >>>>> + if (status) >>>>> + goto out; >>>>> + status = decode_read_plus(xdr, res); >>>>> + if (!status) >>>>> + status = res->count; >>>>> +out: >>>>> + return status; >>>>> +} >>>>> + >>>>> /* >>>>> * Decode SEEK request >>>>> */ >>>>> diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c >>>>> index 95d07a3dc5d1..ed3ec8c36273 100644 >>>>> --- a/fs/nfs/nfs4proc.c >>>>> +++ b/fs/nfs/nfs4proc.c >>>>> @@ -69,6 +69,10 @@ >>>>> >>>>> #include "nfs4trace.h" >>>>> >>>>> +#ifdef CONFIG_NFS_V4_2 >>>>> +#include "nfs42.h" >>>>> +#endif /* CONFIG_NFS_V4_2 */ >>>>> + >>>>> #define NFSDBG_FACILITY NFSDBG_PROC >>>>> >>>>> #define NFS4_BITMASK_SZ 3 >>>>> @@ -5199,28 +5203,60 @@ static bool nfs4_read_stateid_changed(struct >>>>> rpc_task *task, >>>>> return true; >>>>> } >>>>> >>>>> +static bool nfs4_read_plus_not_supported(struct rpc_task *task, >>>>> + struct nfs_pgio_header *hdr) >>>>> +{ >>>>> + struct nfs_server *server = NFS_SERVER(hdr->inode); >>>>> + struct rpc_message *msg = &task->tk_msg; >>>>> + >>>>> + if (msg->rpc_proc == &nfs4_procedures[NFSPROC4_CLNT_READ_PLUS] >>>>> && >>>>> + server->caps & NFS_CAP_READ_PLUS && task->tk_status == >>>>> -ENOTSUPP) { >>>>> + server->caps &= ~NFS_CAP_READ_PLUS; >>>>> + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; >>>>> + rpc_restart_call_prepare(task); >>>>> + return true; >>>>> + } >>>>> + return 
false; >>>>> +} >>>>> + >>>>> static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header >>>>> *hdr) >>>>> { >>>>> - >>>>> dprintk("--> %s\n", __func__); >>>>> >>>>> if (!nfs4_sequence_done(task, &hdr->res.seq_res)) >>>>> return -EAGAIN; >>>>> if (nfs4_read_stateid_changed(task, &hdr->args)) >>>>> return -EAGAIN; >>>>> + if (nfs4_read_plus_not_supported(task, hdr)) >>>>> + return -EAGAIN; >>>>> if (task->tk_status > 0) >>>>> nfs_invalidate_atime(hdr->inode); >>>>> return hdr->pgio_done_cb ? hdr->pgio_done_cb(task, hdr) : >>>>> nfs4_read_done_cb(task, hdr); >>>>> } >>>>> >>>>> +#ifdef CONFIG_NFS_V4_2 >>>>> +static void nfs42_read_plus_support(struct nfs_server *server, struct >>>>> rpc_message *msg) >>>>> +{ >>>>> + if (server->caps & NFS_CAP_READ_PLUS) >>>>> + msg->rpc_proc = >>>>> &nfs4_procedures[NFSPROC4_CLNT_READ_PLUS]; >>>>> + else >>>>> + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; >>>>> +} >>>>> +#else >>>>> +static void nfs42_read_plus_support(struct nfs_server *server, struct >>>>> rpc_message *msg) >>>>> +{ >>>>> + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; >>>>> +} >>>>> +#endif /* CONFIG_NFS_V4_2 */ >>>>> + >>>>> static void nfs4_proc_read_setup(struct nfs_pgio_header *hdr, >>>>> struct rpc_message *msg) >>>>> { >>>>> hdr->timestamp = jiffies; >>>>> if (!hdr->pgio_done_cb) >>>>> hdr->pgio_done_cb = nfs4_read_done_cb; >>>>> - msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; >>>>> + nfs42_read_plus_support(NFS_SERVER(hdr->inode), msg); >>>>> nfs4_init_sequence(&hdr->args.seq_args, &hdr->res.seq_res, 0, >>>>> 0); >>>>> } >>>>> >>>>> @@ -9970,7 +10006,8 @@ static const struct nfs4_minor_version_ops >>>>> nfs_v4_2_minor_ops = { >>>>> | NFS_CAP_SEEK >>>>> | NFS_CAP_LAYOUTSTATS >>>>> | NFS_CAP_CLONE >>>>> - | NFS_CAP_LAYOUTERROR, >>>>> + | NFS_CAP_LAYOUTERROR >>>>> + | NFS_CAP_READ_PLUS, >>>>> .init_client = nfs41_init_client, >>>>> .shutdown_client = nfs41_shutdown_client, >>>>> .match_stateid = nfs41_match_stateid, 
>>>>> diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c >>>>> index 47817ef0aadb..68b2917d0537 100644 >>>>> --- a/fs/nfs/nfs4xdr.c >>>>> +++ b/fs/nfs/nfs4xdr.c >>>>> @@ -7584,6 +7584,7 @@ const struct rpc_procinfo nfs4_procedures[] = { >>>>> PROC42(COPY_NOTIFY, enc_copy_notify, dec_copy_notify) >>>>> , >>>>> PROC(LOOKUPP, enc_lookupp, dec_lookupp), >>>>> PROC42(LAYOUTERROR, enc_layouterror, dec_layouterror) >>>>> , >>>>> + PROC42(READ_PLUS, enc_read_plus, dec_read_plus), >>>>> }; >>>>> >>>>> static unsigned int nfs_version4_counts[ARRAY_SIZE(nfs4_procedures)]; >>>>> diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h >>>>> index 82d8fb422092..c1eeef52545c 100644 >>>>> --- a/include/linux/nfs4.h >>>>> +++ b/include/linux/nfs4.h >>>>> @@ -540,8 +540,8 @@ enum { >>>>> >>>>> NFSPROC4_CLNT_LOOKUPP, >>>>> NFSPROC4_CLNT_LAYOUTERROR, >>>>> - >>>>> NFSPROC4_CLNT_COPY_NOTIFY, >>>>> + NFSPROC4_CLNT_READ_PLUS, >>>>> }; >>>>> >>>>> /* nfs41 types */ >>>>> diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h >>>>> index 465fa98258a3..11248c5a7b24 100644 >>>>> --- a/include/linux/nfs_fs_sb.h >>>>> +++ b/include/linux/nfs_fs_sb.h >>>>> @@ -281,5 +281,6 @@ struct nfs_server { >>>>> #define NFS_CAP_OFFLOAD_CANCEL (1U << 25) >>>>> #define NFS_CAP_LAYOUTERROR (1U << 26) >>>>> #define NFS_CAP_COPY_NOTIFY (1U << 27) >>>>> +#define NFS_CAP_READ_PLUS (1U << 28) >>>>> >>>>> #endif >>>>> diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h >>>>> index 94c77ed55ce1..8efbf3d8b263 100644 >>>>> --- a/include/linux/nfs_xdr.h >>>>> +++ b/include/linux/nfs_xdr.h >>>>> @@ -655,7 +655,7 @@ struct nfs_pgio_args { >>>>> struct nfs_pgio_res { >>>>> struct nfs4_sequence_res seq_res; >>>>> struct nfs_fattr * fattr; >>>>> - __u32 count; >>>>> + __u64 count; >>>>> __u32 op_status; >>>>> union { >>>>> struct { >>>>> -- >>>>> 2.25.0 >>>>> >>>> >>>> -- >>>> Chuck Lever >> >> -- >> Chuck Lever -- Chuck Lever