RE: Trying to reduce NFSv4 timeouts to a few seconds on an established connection

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



> From: Andrew Klaassen <andrew.klaassen@xxxxxxxxxxxxxx>
> Sent: Thursday, February 2, 2023 1:16 PM
> 
> > From: Jeff Layton <jlayton@xxxxxxxxxx>
> > Sent: Monday, January 30, 2023 3:32 PM
> >
> > On Mon, 2023-01-30 at 20:03 +0000, Andrew Klaassen wrote:
> > > > From: Jeff Layton <jlayton@xxxxxxxxxx>
> > > > Sent: Monday, January 30, 2023 2:56 PM
> > > >
> > > > On Mon, 2023-01-30 at 19:33 +0000, Andrew Klaassen wrote:
> > > > > > From: Jeff Layton <jlayton@xxxxxxxxxx>
> > > > > > Sent: Friday, January 27, 2023 8:33 AM
> > > > > >
> > > > > > On Thu, 2023-01-26 at 22:08 +0000, Andrew Klaassen wrote:
> > > > > > > > From: Andrew Klaassen <andrew.klaassen@xxxxxxxxxxxxxx>
> > > > > > > > Sent: Thursday, January 26, 2023 10:32 AM
> > > > > > > >
> > > > > > > > > From: Andrew Klaassen <andrew.klaassen@xxxxxxxxxxxxxx>
> > > > > > > > > Sent: Monday, January 23, 2023 11:31 AM
> > > > > > > > >
> > > > > > > > > Hello,
> > > > > > > > >
> > > > > > > > > There's a specific NFSv4 mount on a specific machine
> > > > > > > > > which we'd like to timeout and return an error after a
> > > > > > > > > few seconds if the server goes away.
> > > > > > > > >
> > > > > > > > > I've confirmed the following on two different kernels,
> > > > > > > > > 4.18.0-
> > > > > > > > > 348.12.2.el8_5.x86_64 and 6.1.7-200.fc37.x86_64.
> > > > > > > > >
> > > > > > > > > I've been able to get both autofs and the mount command
> > > > > > > > > to cooperate, so that the mount attempt fails after an
> > > > > > > > > arbitrary number of seconds.
> > > > > > > > > This mount command, for example, will fail after 6
> > > > > > > > > seconds, as expected based on the
> > > > > > > > > timeo=20,retrans=2,retry=0 options:
> > > > > > > > >
> > > > > > > > > $ time sudo mount -t nfs4 -o
> > > > > > > > > rw,relatime,sync,vers=4.2,rsize=131072,wsize=131072,naml
> > > > > > > > > en
> > > > > > > > > =255
> > > > > > > > > ,acr
> > > > > > > > > egmi
> > > > > > > > > n
> > > > > > > > >
> > > > > > > > =0,acregmax=0,acdirmin=0,acdirmax=0,soft,noac,proto=tcp,ti
> > > > > > > > me
> > > > > > > > o=20
> > > > > > > > ,ret
> > > > > > > > ra
> > > > > > > > > n s=2,retry=0,sec=sys thor04:/mnt/thorfs04  /mnt/thor04
> > > > > > > > > mount.nfs4: Connection timed out
> > > > > > > > >
> > > > > > > > > real    0m6.084s
> > > > > > > > > user    0m0.007s
> > > > > > > > > sys     0m0.015s
> > > > > > > > >
> > > > > > > > > However, if the share is already mounted and the server
> > > > > > > > > goes away, the timeout is always 2 minutes plus the time
> > > > > > > > > I expect based on timeo and retrans.  In this case, 2
> > > > > > > > > minutes and 6
> > > > > > > > > seconds:
> > > > > > > > >
> > > > > > > > > $ time ls /mnt/thor04
> > > > > > > > > ls: cannot access '/mnt/thor04': Connection timed out
> > > > > > > > >
> > > > > > > > > real    2m6.025s
> > > > > > > > > user    0m0.003s
> > > > > > > > > sys     0m0.000s
> > > > > > > > >
> > > > > > > > > Watching the outgoing packets in the second case, the
> > > > > > > > > pattern is always the
> > > > > > > > > same:
> > > > > > > > >  - 0.2 seconds between the first two, then doubling each
> > > > > > > > > time until the two minute mark is exceeded (so the last
> > > > > > > > > NFS packet, which is always the 11th packet, is sent
> > > > > > > > > around 1:45 after the first).
> > > > > > > > >  - Then some generic packets that start exactly-ish on
> > > > > > > > > the two minute mark, 1 second between the first two,
> > > > > > > > > then doubling each time.
> > > > > > > > > (By
> > > > > > > > > this time the NFS command has given up.)
> > > > > > > > >
> > > > > > > > > 11:10:21.898305 IP 10.30.13.2.916 > 10.31.3.13.2049:
> > > > > > > > > Flags [P.], seq 14452:14652, ack 18561, win 501, options
> > > > > > > > > [nop,nop,TS val
> > > > > > > > > 834889483 ecr 1589769203], length 200: NFS request xid
> > > > > > > > > 3614904256
> > > > > > > > > 196 getattr fh
> > > > > > > > > 0,2/53
> > > > > > > > > 11:10:22.105189 IP 10.30.13.2.916 > 10.31.3.13.2049:
> > > > > > > > > Flags [P.], seq 14452:14652, ack 18561, win 501, options
> > > > > > > > > [nop,nop,TS val
> > > > > > > > > 834889690 ecr 1589769203], length 200: NFS request xid
> > > > > > > > > 3614904256
> > > > > > > > > 196 getattr fh
> > > > > > > > > 0,2/53
> > > > > > > > > 11:10:22.313290 IP 10.30.13.2.916 > 10.31.3.13.2049:
> > > > > > > > > Flags [P.], seq 14452:14652, ack 18561, win 501, options
> > > > > > > > > [nop,nop,TS val
> > > > > > > > > 834889898 ecr 1589769203], length 200: NFS request xid
> > > > > > > > > 3614904256
> > > > > > > > > 196 getattr fh
> > > > > > > > > 0,2/53
> > > > > > > > > 11:10:22.721269 IP 10.30.13.2.916 > 10.31.3.13.2049:
> > > > > > > > > Flags [P.], seq 14452:14652, ack 18561, win 501, options
> > > > > > > > > [nop,nop,TS val
> > > > > > > > > 834890306 ecr 1589769203], length 200: NFS request xid
> > > > > > > > > 3614904256
> > > > > > > > > 196 getattr fh
> > > > > > > > > 0,2/53
> > > > > > > > > 11:10:23.569192 IP 10.30.13.2.916 > 10.31.3.13.2049:
> > > > > > > > > Flags [P.], seq 14452:14652, ack 18561, win 501, options
> > > > > > > > > [nop,nop,TS val
> > > > > > > > > 834891154 ecr 1589769203], length 200: NFS request xid
> > > > > > > > > 3614904256
> > > > > > > > > 196 getattr fh
> > > > > > > > > 0,2/53
> > > > > > > > > 11:10:25.233212 IP 10.30.13.2.916 > 10.31.3.13.2049:
> > > > > > > > > Flags [P.], seq 14452:14652, ack 18561, win 501, options
> > > > > > > > > [nop,nop,TS val
> > > > > > > > > 834892818 ecr 1589769203], length 200: NFS request xid
> > > > > > > > > 3614904256
> > > > > > > > > 196 getattr fh
> > > > > > > > > 0,2/53
> > > > > > > > > 11:10:28.497282 IP 10.30.13.2.916 > 10.31.3.13.2049:
> > > > > > > > > Flags [P.], seq 14452:14652, ack 18561, win 501, options
> > > > > > > > > [nop,nop,TS val
> > > > > > > > > 834896082 ecr 1589769203], length 200: NFS request xid
> > > > > > > > > 3614904256
> > > > > > > > > 196 getattr fh
> > > > > > > > > 0,2/53
> > > > > > > > > 11:10:35.025219 IP 10.30.13.2.916 > 10.31.3.13.2049:
> > > > > > > > > Flags [P.], seq 14452:14652, ack 18561, win 501, options
> > > > > > > > > [nop,nop,TS val
> > > > > > > > > 834902610 ecr 1589769203], length 200: NFS request xid
> > > > > > > > > 3614904256
> > > > > > > > > 196 getattr fh
> > > > > > > > > 0,2/53
> > > > > > > > > 11:10:48.337201 IP 10.30.13.2.916 > 10.31.3.13.2049:
> > > > > > > > > Flags [P.], seq 14452:14652, ack 18561, win 501, options
> > > > > > > > > [nop,nop,TS val
> > > > > > > > > 834915922 ecr 1589769203], length 200: NFS request xid
> > > > > > > > > 3614904256
> > > > > > > > > 196 getattr fh
> > > > > > > > > 0,2/53
> > > > > > > > > 11:11:14.449303 IP 10.30.13.2.916 > 10.31.3.13.2049:
> > > > > > > > > Flags [P.], seq 14452:14652, ack 18561, win 501, options
> > > > > > > > > [nop,nop,TS val
> > > > > > > > > 834942034 ecr 1589769203], length 200: NFS request xid
> > > > > > > > > 3614904256
> > > > > > > > > 196 getattr fh
> > > > > > > > > 0,2/53
> > > > > > > > > 11:12:08.721251 IP 10.30.13.2.916 > 10.31.3.13.2049:
> > > > > > > > > Flags [P.], seq 14452:14652, ack 18561, win 501, options
> > > > > > > > > [nop,nop,TS val
> > > > > > > > > 834996306 ecr 1589769203], length 200: NFS request xid
> > > > > > > > > 3614904256
> > > > > > > > > 196 getattr fh
> > > > > > > > > 0,2/53
> > > > > > > > > 11:12:22.545394 IP 10.30.13.2.942 > 10.31.3.13.2049:
> > > > > > > > > Flags [S], seq 1375256951, win 64240, options [mss
> > > > > > > > > 1460,sackOK,TS val
> > > > > > > > > 835010130 ecr 0,nop,wscale 7], length 0
> > > > > > > > > 11:12:23.570199 IP 10.30.13.2.942 > 10.31.3.13.2049:
> > > > > > > > > Flags [S], seq 1375256951, win 64240, options [mss
> > > > > > > > > 1460,sackOK,TS val
> > > > > > > > > 835011155 ecr 0,nop,wscale 7], length 0
> > > > > > > > > 11:12:25.617284 IP 10.30.13.2.942 > 10.31.3.13.2049:
> > > > > > > > > Flags [S], seq 1375256951, win 64240, options [mss
> > > > > > > > > 1460,sackOK,TS val
> > > > > > > > > 835013202 ecr 0,nop,wscale 7], length 0
> > > > > > > > > 11:12:29.649219 IP 10.30.13.2.942 > 10.31.3.13.2049:
> > > > > > > > > Flags [S], seq 1375256951, win 64240, options [mss
> > > > > > > > > 1460,sackOK,TS val
> > > > > > > > > 835017234 ecr 0,nop,wscale 7], length 0
> > > > > > > > > 11:12:37.905274 IP 10.30.13.2.942 > 10.31.3.13.2049:
> > > > > > > > > Flags [S], seq 1375256951, win 64240, options [mss
> > > > > > > > > 1460,sackOK,TS val
> > > > > > > > > 835025490 ecr 0,nop,wscale 7], length 0
> > > > > > > > > 11:12:54.289212 IP 10.30.13.2.942 > 10.31.3.13.2049:
> > > > > > > > > Flags [S], seq 1375256951, win 64240, options [mss
> > > > > > > > > 1460,sackOK,TS val
> > > > > > > > > 835041874 ecr 0,nop,wscale 7], length 0
> > > > > > > > > 11:13:26.545304 IP 10.30.13.2.942 > 10.31.3.13.2049:
> > > > > > > > > Flags [S], seq 1375256951, win 64240, options [mss
> > > > > > > > > 1460,sackOK,TS val
> > > > > > > > > 835074130 ecr 0,nop,wscale 7], length 0
> > > > > > > > >
> > > > > > > > > I tried changing tcp_retries2 as suggested in another
> > > > > > > > > thread from this list:
> > > > > > > > >
> > > > > > > > > # echo 3 > /proc/sys/net/ipv4/tcp_retries2
> > > > > > > > >
> > > > > > > > > ...but it made no difference on either kernel.  The 2
> > > > > > > > > minute timeout also doesn't seem to match with what I'd
> > > > > > > > > calculate from the initial value of tcp_retries2, which
> > > > > > > > > should give a much higher timeout.
> > > > > > > > >
> > > > > > > > > The only clue I've been able to find is in the retry=n
> > > > > > > > > entry in the NFS
> > > > > > > > > manpage:
> > > > > > > > >
> > > > > > > > > " For TCP the default is 3 minutes, but system TCP
> > > > > > > > > connection timeouts will sometimes limit the timeout of
> > > > > > > > > each retransmission to around
> > > > > > > > > 2
> > > > > > > > minutes."
> > > > > > > > >
> > > > > > > > > What I'm not able to make sense of:
> > > > > > > > >  - The retry option says that it applies to mount
> > > > > > > > > operations, not read/write operations.  However, in this
> > > > > > > > > case I'm seeing the 2 minute delay on read/write
> > > > > > > > > operations but *not* mount operations.
> > > > > > > > >  - A couple of hours of searching didn't lead me to any
> > > > > > > > > kernel settings that would result in a 2 minute timeout.
> > > > > > > > >
> > > > > > > > > Does anyone have any clues about a) what's happening and
> > > > > > > > > b)
> > > > > > > > > how to get our desired behaviour of being able to
> > > > > > > > > control both mount and read/write timeouts down to a few
> seconds?
> > > > > > > > >
> > > > > > > > > Thanks.
> > > > > > > >
> > > > > > > > I thought that changing TCP_RTO_MAX in include/net/tcp.h
> > > > > > > > from
> > > > > > > > 120 to
> > > > > > > > something smaller and recompiling the kernel would change
> > > > > > > > the 2 minute timeout, but it had no effect.  I'm going to
> > > > > > > > keep poking through the kernel code to see if there's a
> > > > > > > > knob I can turn to change the 2 minute timeout, so that I
> > > > > > > > can at least understand where it's coming from.
> > > > > > > >
> > > > > > > > Any hints as to where I should be looking?
> > > > > > >
> > > > > > > I believe I've made some progress with this today:
> > > > > > >
> > > > > > >  - Calls to rpc_create() from fs/nfs/client.c are sending an
> > > > > > > rpc_timeout struct with their args.
> > > > > > >  - rpc_create() does *not* pass the timeout on to
> > > > > > > xprt_create_transport(), which then can't pass it on to
> > > > > > > xs_setup_tcp().
> > > > > > >  - xs_setup_tcp(), having no timeout passed to it, uses
> > > > > > > xs_tcp_default_timeout instead.
> > > > > > >  - changing xs_tcp_default_timeout changes the "ls" timeout
> > > > > > > behaviour I described above.
> > > > > > >
> > > > > > > In theory all of this means that the timeout simply needs to
> > > > > > > be passed through and used instead of
> > > > > > > xs_tcp_default_timeout.  I'm going to give this a try
> > > > > > > tomorrow.
> > > > > > >
> > > > > >
> > > > > > That's a great root-cause analysis. The interlocking timeouts
> > > > > > involved with NFS and its sockets can be really difficult to
> > > > > > unwind.
> > > > > >
> > > > > > Is there a way to automate this testcase? That might be nice
> > > > > > to have in xfstests or the nfstest suite.
> > > > > >
> > > > > > > Here's what I'm going to try first; I'm no C programmer,
> > > > > > > though, so any advice or corrections you might have would be
> > > > > > > appreciated.
> > > > > > >
> > > > > > > Thanks.
> > > > > > >
> > > > > > > Andrew
> > > > > > >
> > > > > > > diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index
> > > > > > > 0b0b9f1eed46..1350c1f489f7 100644
> > > > > > > --- a/net/sunrpc/clnt.c
> > > > > > > +++ b/net/sunrpc/clnt.c
> > > > > > > @@ -532,6 +532,7 @@ struct rpc_clnt *rpc_create(struct
> > > > > > > rpc_create_args
> > > > > > > *args)
> > > > > > >                 .addrlen = args->addrsize,
> > > > > > >                 .servername = args->servername,
> > > > > > >                 .bc_xprt = args->bc_xprt,
> > > > > > > +               .timeout = args->timeout,
> > > > > > >         };
> > > > > > >         char servername[48];
> > > > > > >         struct rpc_clnt *clnt; diff --git
> > > > > > > a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index
> > > > > > > aaa5b2741b79..adc79d94b59e 100644
> > > > > > > --- a/net/sunrpc/xprtsock.c
> > > > > > > +++ b/net/sunrpc/xprtsock.c
> > > > > > > @@ -3003,7 +3003,7 @@ static struct rpc_xprt
> > > > > > > *xs_setup_tcp(struct xprt_create *args)
> > > > > > >         xprt->idle_timeout = XS_IDLE_DISC_TO;
> > > > > > >
> > > > > > >         xprt->ops = &xs_tcp_ops;
> > > > > > > -       xprt->timeout = &xs_tcp_default_timeout;
> > > > > > > +       xprt->timeout = args->timeout;
> > > > > > >
> > > > > > >         xprt->max_reconnect_timeout = xprt->timeout-
> > > > > > > >to_maxval;
> > > > > > >         xprt->connect_timeout = xprt->timeout->to_initval *
> > > > > > >
> > > > > >
> > > > > > Looks like you're probably on the right track. You're missing
> > > > > > a few
> > > > > > things:
> > > > > >
> > > > > > You'll need to add a "timeout" field to struct xprt_create in
> > > > > > include/linux/sunrpc/xprt.h, and there may be some other
> > > > > > places that either need to set the timeout in that structure,
> > > > > > or do something with that field when it's set.
> > > > > >
> > > > > > Once you have something that fixes your reproducer, go ahead
> > > > > > and post it and we can help you work through whatever changes
> > > > > > need to me made
> > > > to
> > > > > > make it work.
> > > > > >
> > > > > > Nice work!
> > > > >
> > > > > Thanks for the tip, that was helpful.
> > > > >
> > > > > Currently I'm fighting with kernel recompilation.  I decided to
> > > > > make it quicker by slimming down the config, but apparently I've
> > > > > achieved something which Google claims no one else has achieved:
> > > > >
> > > > > Errors on kernel make modules_install:
> > > > >
> > > > >   DEPMOD  /lib/modules/6.2.0-rc5-sunrpctimeo+
> > > > > depmod: WARNING: /lib/modules/6.2.0-rc5-
> > > > > sunrpctimeo+/kernel/fs/nfs/nfsv4.ko needs unknown symbol
> > > > > nfs4_disable_idmapping
> > > > > depmod: WARNING: /lib/modules/6.2.0-rc5-
> > > > > sunrpctimeo+/kernel/fs/nfs/nfsv4.ko needs unknown symbol
> > > > > nfs4_label_alloc
> > > > > depmod: WARNING: /lib/modules/6.2.0-rc5-
> > > > > sunrpctimeo+/kernel/fs/nfs/nfsv4.ko needs unknown symbol
> > > > > send_implementation_id
> > > > > depmod: WARNING: /lib/modules/6.2.0-rc5-
> > > > > sunrpctimeo+/kernel/fs/nfs/nfsv4.ko needs unknown symbol
> > > > > nfs_atomic_open
> > > > > depmod: WARNING: /lib/modules/6.2.0-rc5-
> > > > > sunrpctimeo+/kernel/fs/nfs/nfsv4.ko needs unknown symbol
> > > > > nfs_clear_verifier_delegated
> > > > > depmod: WARNING: /lib/modules/6.2.0-rc5-
> > > > > sunrpctimeo+/kernel/fs/nfs/nfsv4.ko needs unknown symbol
> > > > > nfs4_client_id_uniquifier
> > > > > depmod: WARNING: /lib/modules/6.2.0-rc5-
> > > > > sunrpctimeo+/kernel/fs/nfs/nfsv4.ko needs unknown symbol
> > > > > nfs4_dentry_operations
> > > > > depmod: WARNING: /lib/modules/6.2.0-rc5-
> > > > > sunrpctimeo+/kernel/fs/nfs/nfsv4.ko needs unknown symbol
> > > > > nfs_fscache_open_file
> > > > > depmod: WARNING: /lib/modules/6.2.0-rc5-
> > > > > sunrpctimeo+/kernel/fs/nfs/nfsv4.ko needs unknown symbol
> > > > nfs4_fs_type
> > > > > depmod: WARNING: /lib/modules/6.2.0-rc5-
> > > > > sunrpctimeo+/kernel/fs/nfs/nfsv4.ko needs unknown symbol
> > > > > recover_lost_locks
> > > > > depmod: WARNING: /lib/modules/6.2.0-rc5-
> > > > > sunrpctimeo+/kernel/fs/nfs/nfsv4.ko needs unknown symbol
> > > > > nfs_callback_nr_threads
> > > > > depmod: WARNING: /lib/modules/6.2.0-rc5-
> > > > > sunrpctimeo+/kernel/fs/nfs/nfsv4.ko needs unknown symbol
> > > > > max_session_cb_slots
> > > > > depmod: WARNING: /lib/modules/6.2.0-rc5-
> > > > > sunrpctimeo+/kernel/fs/nfs/nfsv4.ko needs unknown symbol
> > > > > max_session_slots
> > > > > depmod: WARNING: /lib/modules/6.2.0-rc5-
> > > > > sunrpctimeo+/kernel/fs/nfs/nfsv4.ko needs unknown symbol
> > > > > nfs_idmap_cache_timeout
> > > > > depmod: WARNING: /lib/modules/6.2.0-rc5-
> > > > > sunrpctimeo+/kernel/fs/nfs/nfsv4.ko needs unknown symbol
> > > > > nfs_callback_set_tcpport
> > > > >
> > > > > Errors on module load:
> > > > >
> > > > > [   94.008271] nfsv4: Unknown symbol nfs4_disable_idmapping (err -
> > > > > 2)
> > > > > [   94.008321] nfsv4: Unknown symbol nfs4_label_alloc (err -2)
> > > > > [   94.008434] nfsv4: Unknown symbol send_implementation_id (err -
> > > > > 2)
> > > > > [   94.008446] nfsv4: Unknown symbol nfs_atomic_open (err -2)
> > > > > [   94.008468] nfsv4: Unknown symbol nfs_clear_verifier_delegated
> > > > > (err
> > > > > -2)
> > > > > [   94.008475] nfsv4: Unknown symbol nfs4_client_id_uniquifier
> > > > > (err -
> > > > > 2)
> > > > > [   94.008501] nfsv4: Unknown symbol nfs4_dentry_operations (err -
> > > > > 2)
> > > > > [   94.008521] nfsv4: Unknown symbol nfs_fscache_open_file (err -
> > > > > 2)
> > > > > [   94.008566] nfsv4: Unknown symbol nfs4_fs_type (err -2)
> > > > > [   94.008595] nfsv4: Unknown symbol recover_lost_locks (err -2)
> > > > > [   94.008639] nfsv4: Unknown symbol nfs_callback_nr_threads (err
> > > > > -2)
> > > > > [   94.008654] nfsv4: Unknown symbol max_session_cb_slots (err -2)
> > > > > [   94.008678] nfsv4: Unknown symbol max_session_slots (err -2)
> > > > > [   94.008694] nfsv4: Unknown symbol nfs_idmap_cache_timeout (err
> > > > > -2)
> > > > > [   94.008709] nfsv4: Unknown symbol nfs_callback_set_tcpport (err
> > > > > -2)
> > > > >
> > > > > I suspect I've turned something off in the config that I
> > > > > shouldn't have, but I'm not sure what.  I see that one of the
> > > > > symbols
> > > > > (nfs_clear_verifier_delegated) is in include/linux/nfs_fs.h, and
> > > > > the others are defined in fs/nfs/nfs4_fs.h, fs/nfs/super.c,
> > > > > fs/nfs/dir.c, fs/nfs/inode.c, fs/nfs/fscache.c, and
> > > > > fs/nfs/fs_context.c.  I'm changing config options and
> > > > > recompiling to try to figure out what I'm missing, but at a
> > > > > couple of hours per compile and only a couple of days a week to
> > > > > work on this it's slow going.  Any hints as to what I might be
> > > > > doing wrong would be appreciated.  😊
> > > > >
> > > >
> > > > Looks like the ABI got broken when you turned off some options.
> > > >
> > > > Generally, if you just want to build a single module, then you
> > > > want the .config to be _exactly_ the one that you used to build
> > > > the kernel you're going to plug it into. Then to build the modules
> > > > under fs/nfs you can
> > > > do:
> > > >
> > > >     make modules_prepare
> > > >     make M=fs/nfs
> > > >
> > > > ...and then drop the resulting .ko objects into the right place in
> > > > /lib/modules.
> > > >
> > > > That said, it may be simpler to just build and work with a whole
> > > > kernel for testing purposes. Working with an individual kmod can
> > > > be a bit tricky unless you know what you're doing.
> > > >
> > > > Once you do the first, subsequent builds should be reasonably fast.
> > >
> > > I'm going to go back to a full kernel build with make oldconfig
> > > using the distro's kernel config to try to avoid this latest issue,
> > > then try what you've suggested to speed up recompiles.
> > >
> > > Since my changes are in net/sunrpc, should I be doing something like
> > > this?
> > >
> > > make modules_prepare
> > > make M=net/sunrpc
> > > make M=fs/nfs
> > >
> > > Or do I not need to recompile nfs if I'm only touching the internals
> > > of sunrpc?
> > >
> > > Thanks again.
> > >
> > > Andrew
> > >
> > >
> > >
> >
> > You shouldn't need to build both *UNLESS* you change the ABI. That
> > includes stuff like number of arguments to an exported function, or
> > the size or layout of particular structures or arrays that both
> > modules might be working with, etc...
> >
> > If you do that, then things can break in all sorts of "interesting"
> > ways that can be very hard to track down. Without seeing your patch,
> > it's hard to know whether you're breaking those rules here. YMMV, of
> course.
> >
> > Again, doing a full kernel build is the safest way to avoid that sort
> > of thing. I'd council against shortcuts here unless you know what
> > you're doing. Let the machine do the work. ;)
> 
> In the end I had to do a fresh compile with a different EXTRAVERSION to get
> rid of the "Unknown symbol" errors.  I'm guessing it has something to do
> with leftovers in /boot or /lib/modules from an old build, but I didn't pursue
> it.
> 
> After a few tries, I finally got a version that didn't put random values in the
> timeout or make the kernel blow up.  Now the compiler gives me a warning
> about "assignment discards ‘const’ qualifier from pointer target type", but it
> works.  I'd be happy to get advice on how to make it better.  (Keeping in
> mind that I'm far from a C programmer.  :-) )
> 
> Here's the patch so far:
> 
> diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index
> b9f59aabee53..5b7fdaff9267 100644
> --- a/include/linux/sunrpc/xprt.h
> +++ b/include/linux/sunrpc/xprt.h
> @@ -333,6 +333,7 @@ struct xprt_create {
>         struct svc_xprt         *bc_xprt;       /* NFSv4.1 backchannel */
>         struct rpc_xprt_switch  *bc_xps;
>         unsigned int            flags;
> +       struct rpc_timeout      *timeout;       /* timeout parms */
>  };
> 
>  struct xprt_class {
> diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index
> 0b0b9f1eed46..1350c1f489f7 100644
> --- a/net/sunrpc/clnt.c
> +++ b/net/sunrpc/clnt.c
> @@ -532,6 +532,7 @@ struct rpc_clnt *rpc_create(struct rpc_create_args
> *args)
>                 .addrlen = args->addrsize,
>                 .servername = args->servername,
>                 .bc_xprt = args->bc_xprt,
> +               .timeout = args->timeout,
>         };
>         char servername[48];
>         struct rpc_clnt *clnt;
> diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index
> aaa5b2741b79..0011596be4ad 100644
> --- a/net/sunrpc/xprtsock.c
> +++ b/net/sunrpc/xprtsock.c
> @@ -2174,6 +2174,17 @@ static void xs_tcp_set_socket_timeouts(struct
> rpc_xprt *xprt,
>         unsigned int keepcnt;
>         unsigned int timeo;
> 
> +       dprintk("xs_tcp_set_socket_timeouts: xprt->timeout->to_initval:
> %lu\n",
> +               xprt->timeout->to_initval);
> +       dprintk("xs_tcp_set_socket_timeouts: xprt->timeout->to_maxval:
> %lu\n",
> +               xprt->timeout->to_maxval);
> +       dprintk("xs_tcp_set_socket_timeouts: xprt->timeout->to_increment:
> %lu\n",
> +               xprt->timeout->to_increment);
> +       dprintk("xs_tcp_set_socket_timeouts: xprt->timeout->to_retries:
> %u\n",
> +               xprt->timeout->to_retries);
> +       dprintk("xs_tcp_set_socket_timeouts: xprt->timeout->to_exponential:
> %u\n",
> +               xprt->timeout->to_exponential);
> +
>         spin_lock(&xprt->transport_lock);
>         keepidle = DIV_ROUND_UP(xprt->timeout->to_initval, HZ);
>         keepcnt = xprt->timeout->to_retries + 1; @@ -3003,7 +3014,24 @@
> static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args)
>         xprt->idle_timeout = XS_IDLE_DISC_TO;
> 
>         xprt->ops = &xs_tcp_ops;
> -       xprt->timeout = &xs_tcp_default_timeout;
> +
> +       if (args->timeout) {
> +               xprt->timeout = kmalloc(sizeof(struct rpc_timeout), GFP_KERNEL);
> +               memcpy(xprt->timeout, args->timeout, sizeof(struct rpc_timeout));
> +       } else {
> +               xprt->timeout = &xs_tcp_default_timeout;
> +       }
> +
> +       dprintk("xs_setup_tcp: xprt->timeout->to_initval: %lu\n",
> +               xprt->timeout->to_initval);
> +       dprintk("xs_setup_tcp: xprt->timeout->to_maxval: %lu\n",
> +               xprt->timeout->to_maxval);
> +       dprintk("xs_setup_tcp: xprt->timeout->to_increment: %lu\n",
> +               xprt->timeout->to_increment);
> +       dprintk("xs_setup_tcp: xprt->timeout->to_retries: %u\n",
> +               xprt->timeout->to_retries);
> +       dprintk("xs_setup_tcp: xprt->timeout->to_exponential: %u\n",
> +               xprt->timeout->to_exponential);
>         xprt->max_reconnect_timeout = xprt->timeout->to_maxval;
>         xprt->connect_timeout = xprt->timeout->to_initval *

I have eliminated the warnings about discarding const.  However, it has resulted in a couple of questions that I hope someone can answer:

 - I'm allocating memory.  I assume that means I should free it somewhere.  But where?  In xprt_destroy(), which appears to do cleanup?  Or in xprt_destroy_cb(), which is called from xprt_destroy() and which frees xprt->servername?  Or somewhere else completely?
 - If I free the allocated memory, will that cause any problems in the cases where no timeout is passed in via the args and the static const struct xs_tcp_default_timeout is assigned to xprt->timeout?
 - If freeing the static const struct default will cause a problem, what should I do instead?  Allocate and memcpy even when assigning the default?  And would that mean doing the same thing for all the other transports that are setting timeouts (local, udp, tcp, and bc_tcp)?

Thanks for anyone who has advice.

Latest diff below.  Thanks.

Andrew


diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index b9f59aabee53..0b30db910af3 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -333,6 +333,7 @@ struct xprt_create {
        struct svc_xprt         *bc_xprt;       /* NFSv4.1 backchannel */
        struct rpc_xprt_switch  *bc_xps;
        unsigned int            flags;
+       const struct rpc_timeout *timeout;      /* timeout parms */
 };

 struct xprt_class {
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 0b0b9f1eed46..1350c1f489f7 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -532,6 +532,7 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
                .addrlen = args->addrsize,
                .servername = args->servername,
                .bc_xprt = args->bc_xprt,
+               .timeout = args->timeout,
        };
        char servername[48];
        struct rpc_clnt *clnt;
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index aaa5b2741b79..457b9e0d72ba 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -2983,6 +2983,7 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args)
        struct rpc_xprt *xprt;
        struct sock_xprt *transport;
        struct rpc_xprt *ret;
+       struct rpc_timeout *timeout;
        unsigned int max_slot_table_size = xprt_max_tcp_slot_table_entries;

        if (args->flags & XPRT_CREATE_INFINITE_SLOTS)
@@ -3003,7 +3004,14 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args)
        xprt->idle_timeout = XS_IDLE_DISC_TO;

        xprt->ops = &xs_tcp_ops;
-       xprt->timeout = &xs_tcp_default_timeout;
+
+       if (args->timeout) {
+               timeout = kmalloc(sizeof(struct rpc_timeout), GFP_KERNEL);
+               memcpy(timeout, args->timeout, sizeof(struct rpc_timeout));
+               xprt->timeout = timeout;
+       } else {
+               xprt->timeout = &xs_tcp_default_timeout;
+       }

        xprt->max_reconnect_timeout = xprt->timeout->to_maxval;
        xprt->connect_timeout = xprt->timeout->to_initval *




 




[Index of Archives]     [Linux Filesystem Development]     [Linux USB Development]     [Linux Media Development]     [Video for Linux]     [Linux NILFS]     [Linux Audio Users]     [Yosemite Info]     [Linux SCSI]

  Powered by Linux