RE: Trying to reduce NFSv4 timeouts to a few seconds on an established connection

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



> From: Jeff Layton <jlayton@xxxxxxxxxx>
> Sent: Monday, January 30, 2023 3:32 PM
> 
> On Mon, 2023-01-30 at 20:03 +0000, Andrew Klaassen wrote:
> > > From: Jeff Layton <jlayton@xxxxxxxxxx>
> > > Sent: Monday, January 30, 2023 2:56 PM
> > >
> > > On Mon, 2023-01-30 at 19:33 +0000, Andrew Klaassen wrote:
> > > > > From: Jeff Layton <jlayton@xxxxxxxxxx>
> > > > > Sent: Friday, January 27, 2023 8:33 AM
> > > > >
> > > > > On Thu, 2023-01-26 at 22:08 +0000, Andrew Klaassen wrote:
> > > > > > > From: Andrew Klaassen <andrew.klaassen@xxxxxxxxxxxxxx>
> > > > > > > Sent: Thursday, January 26, 2023 10:32 AM
> > > > > > >
> > > > > > > > From: Andrew Klaassen <andrew.klaassen@xxxxxxxxxxxxxx>
> > > > > > > > Sent: Monday, January 23, 2023 11:31 AM
> > > > > > > >
> > > > > > > > Hello,
> > > > > > > >
> > > > > > > > There's a specific NFSv4 mount on a specific machine which
> > > > > > > > we'd like to timeout and return an error after a few
> > > > > > > > seconds if the server goes away.
> > > > > > > >
> > > > > > > > I've confirmed the following on two different kernels,
> > > > > > > > 4.18.0-
> > > > > > > > 348.12.2.el8_5.x86_64 and 6.1.7-200.fc37.x86_64.
> > > > > > > >
> > > > > > > > I've been able to get both autofs and the mount command to
> > > > > > > > cooperate, so that the mount attempt fails after an
> > > > > > > > arbitrary number of seconds.
> > > > > > > > This mount command, for example, will fail after 6
> > > > > > > > seconds, as expected based on the
> > > > > > > > timeo=20,retrans=2,retry=0 options:
> > > > > > > >
> > > > > > > > $ time sudo mount -t nfs4 -o
> > > > > > > > rw,relatime,sync,vers=4.2,rsize=131072,wsize=131072,namlen
> > > > > > > > =255
> > > > > > > > ,acr
> > > > > > > > egmi
> > > > > > > > n
> > > > > > > >
> > > > > > > =0,acregmax=0,acdirmin=0,acdirmax=0,soft,noac,proto=tcp,time
> > > > > > > o=20
> > > > > > > ,ret
> > > > > > > ra
> > > > > > > > n s=2,retry=0,sec=sys thor04:/mnt/thorfs04  /mnt/thor04
> > > > > > > > mount.nfs4: Connection timed out
> > > > > > > >
> > > > > > > > real    0m6.084s
> > > > > > > > user    0m0.007s
> > > > > > > > sys     0m0.015s
> > > > > > > >
> > > > > > > > However, if the share is already mounted and the server
> > > > > > > > goes away, the timeout is always 2 minutes plus the time I
> > > > > > > > expect based on timeo and retrans.  In this case, 2
> > > > > > > > minutes and 6
> > > > > > > > seconds:
> > > > > > > >
> > > > > > > > $ time ls /mnt/thor04
> > > > > > > > ls: cannot access '/mnt/thor04': Connection timed out
> > > > > > > >
> > > > > > > > real    2m6.025s
> > > > > > > > user    0m0.003s
> > > > > > > > sys     0m0.000s
> > > > > > > >
> > > > > > > > Watching the outgoing packets in the second case, the
> > > > > > > > pattern is always the
> > > > > > > > same:
> > > > > > > >  - 0.2 seconds between the first two, then doubling each
> > > > > > > > time until the two minute mark is exceeded (so the last
> > > > > > > > NFS packet, which is always the 11th packet, is sent
> > > > > > > > around 1:45 after the first).
> > > > > > > >  - Then some generic packets that start exactly-ish on the
> > > > > > > > two minute mark, 1 second between the first two, then
> > > > > > > > doubling each time.
> > > > > > > > (By
> > > > > > > > this time the NFS command has given up.)
> > > > > > > >
> > > > > > > > 11:10:21.898305 IP 10.30.13.2.916 > 10.31.3.13.2049: Flags
> > > > > > > > [P.], seq 14452:14652, ack 18561, win 501, options
> > > > > > > > [nop,nop,TS val
> > > > > > > > 834889483 ecr 1589769203], length 200: NFS request xid
> > > > > > > > 3614904256
> > > > > > > > 196 getattr fh
> > > > > > > > 0,2/53
> > > > > > > > 11:10:22.105189 IP 10.30.13.2.916 > 10.31.3.13.2049: Flags
> > > > > > > > [P.], seq 14452:14652, ack 18561, win 501, options
> > > > > > > > [nop,nop,TS val
> > > > > > > > 834889690 ecr 1589769203], length 200: NFS request xid
> > > > > > > > 3614904256
> > > > > > > > 196 getattr fh
> > > > > > > > 0,2/53
> > > > > > > > 11:10:22.313290 IP 10.30.13.2.916 > 10.31.3.13.2049: Flags
> > > > > > > > [P.], seq 14452:14652, ack 18561, win 501, options
> > > > > > > > [nop,nop,TS val
> > > > > > > > 834889898 ecr 1589769203], length 200: NFS request xid
> > > > > > > > 3614904256
> > > > > > > > 196 getattr fh
> > > > > > > > 0,2/53
> > > > > > > > 11:10:22.721269 IP 10.30.13.2.916 > 10.31.3.13.2049: Flags
> > > > > > > > [P.], seq 14452:14652, ack 18561, win 501, options
> > > > > > > > [nop,nop,TS val
> > > > > > > > 834890306 ecr 1589769203], length 200: NFS request xid
> > > > > > > > 3614904256
> > > > > > > > 196 getattr fh
> > > > > > > > 0,2/53
> > > > > > > > 11:10:23.569192 IP 10.30.13.2.916 > 10.31.3.13.2049: Flags
> > > > > > > > [P.], seq 14452:14652, ack 18561, win 501, options
> > > > > > > > [nop,nop,TS val
> > > > > > > > 834891154 ecr 1589769203], length 200: NFS request xid
> > > > > > > > 3614904256
> > > > > > > > 196 getattr fh
> > > > > > > > 0,2/53
> > > > > > > > 11:10:25.233212 IP 10.30.13.2.916 > 10.31.3.13.2049: Flags
> > > > > > > > [P.], seq 14452:14652, ack 18561, win 501, options
> > > > > > > > [nop,nop,TS val
> > > > > > > > 834892818 ecr 1589769203], length 200: NFS request xid
> > > > > > > > 3614904256
> > > > > > > > 196 getattr fh
> > > > > > > > 0,2/53
> > > > > > > > 11:10:28.497282 IP 10.30.13.2.916 > 10.31.3.13.2049: Flags
> > > > > > > > [P.], seq 14452:14652, ack 18561, win 501, options
> > > > > > > > [nop,nop,TS val
> > > > > > > > 834896082 ecr 1589769203], length 200: NFS request xid
> > > > > > > > 3614904256
> > > > > > > > 196 getattr fh
> > > > > > > > 0,2/53
> > > > > > > > 11:10:35.025219 IP 10.30.13.2.916 > 10.31.3.13.2049: Flags
> > > > > > > > [P.], seq 14452:14652, ack 18561, win 501, options
> > > > > > > > [nop,nop,TS val
> > > > > > > > 834902610 ecr 1589769203], length 200: NFS request xid
> > > > > > > > 3614904256
> > > > > > > > 196 getattr fh
> > > > > > > > 0,2/53
> > > > > > > > 11:10:48.337201 IP 10.30.13.2.916 > 10.31.3.13.2049: Flags
> > > > > > > > [P.], seq 14452:14652, ack 18561, win 501, options
> > > > > > > > [nop,nop,TS val
> > > > > > > > 834915922 ecr 1589769203], length 200: NFS request xid
> > > > > > > > 3614904256
> > > > > > > > 196 getattr fh
> > > > > > > > 0,2/53
> > > > > > > > 11:11:14.449303 IP 10.30.13.2.916 > 10.31.3.13.2049: Flags
> > > > > > > > [P.], seq 14452:14652, ack 18561, win 501, options
> > > > > > > > [nop,nop,TS val
> > > > > > > > 834942034 ecr 1589769203], length 200: NFS request xid
> > > > > > > > 3614904256
> > > > > > > > 196 getattr fh
> > > > > > > > 0,2/53
> > > > > > > > 11:12:08.721251 IP 10.30.13.2.916 > 10.31.3.13.2049: Flags
> > > > > > > > [P.], seq 14452:14652, ack 18561, win 501, options
> > > > > > > > [nop,nop,TS val
> > > > > > > > 834996306 ecr 1589769203], length 200: NFS request xid
> > > > > > > > 3614904256
> > > > > > > > 196 getattr fh
> > > > > > > > 0,2/53
> > > > > > > > 11:12:22.545394 IP 10.30.13.2.942 > 10.31.3.13.2049: Flags
> > > > > > > > [S], seq 1375256951, win 64240, options [mss
> > > > > > > > 1460,sackOK,TS val
> > > > > > > > 835010130 ecr 0,nop,wscale 7], length 0
> > > > > > > > 11:12:23.570199 IP 10.30.13.2.942 > 10.31.3.13.2049: Flags
> > > > > > > > [S], seq 1375256951, win 64240, options [mss
> > > > > > > > 1460,sackOK,TS val
> > > > > > > > 835011155 ecr 0,nop,wscale 7], length 0
> > > > > > > > 11:12:25.617284 IP 10.30.13.2.942 > 10.31.3.13.2049: Flags
> > > > > > > > [S], seq 1375256951, win 64240, options [mss
> > > > > > > > 1460,sackOK,TS val
> > > > > > > > 835013202 ecr 0,nop,wscale 7], length 0
> > > > > > > > 11:12:29.649219 IP 10.30.13.2.942 > 10.31.3.13.2049: Flags
> > > > > > > > [S], seq 1375256951, win 64240, options [mss
> > > > > > > > 1460,sackOK,TS val
> > > > > > > > 835017234 ecr 0,nop,wscale 7], length 0
> > > > > > > > 11:12:37.905274 IP 10.30.13.2.942 > 10.31.3.13.2049: Flags
> > > > > > > > [S], seq 1375256951, win 64240, options [mss
> > > > > > > > 1460,sackOK,TS val
> > > > > > > > 835025490 ecr 0,nop,wscale 7], length 0
> > > > > > > > 11:12:54.289212 IP 10.30.13.2.942 > 10.31.3.13.2049: Flags
> > > > > > > > [S], seq 1375256951, win 64240, options [mss
> > > > > > > > 1460,sackOK,TS val
> > > > > > > > 835041874 ecr 0,nop,wscale 7], length 0
> > > > > > > > 11:13:26.545304 IP 10.30.13.2.942 > 10.31.3.13.2049: Flags
> > > > > > > > [S], seq 1375256951, win 64240, options [mss
> > > > > > > > 1460,sackOK,TS val
> > > > > > > > 835074130 ecr 0,nop,wscale 7], length 0
> > > > > > > >
> > > > > > > > I tried changing tcp_retries2 as suggested in another
> > > > > > > > thread from this list:
> > > > > > > >
> > > > > > > > # echo 3 > /proc/sys/net/ipv4/tcp_retries2
> > > > > > > >
> > > > > > > > ...but it made no difference on either kernel.  The 2
> > > > > > > > minute timeout also doesn't seem to match with what I'd
> > > > > > > > calculate from the initial value of tcp_retries2, which
> > > > > > > > should give a much higher timeout.
> > > > > > > >
> > > > > > > > The only clue I've been able to find is in the retry=n
> > > > > > > > entry in the NFS
> > > > > > > > manpage:
> > > > > > > >
> > > > > > > > " For TCP the default is 3 minutes, but system TCP
> > > > > > > > connection timeouts will sometimes limit the timeout of
> > > > > > > > each retransmission to around
> > > > > > > > 2
> > > > > > > minutes."
> > > > > > > >
> > > > > > > > What I'm not able to make sense of:
> > > > > > > >  - The retry option says that it applies to mount
> > > > > > > > operations, not read/write operations.  However, in this
> > > > > > > > case I'm seeing the 2 minute delay on read/write
> > > > > > > > operations but *not* mount operations.
> > > > > > > >  - A couple of hours of searching didn't lead me to any
> > > > > > > > kernel settings that would result in a 2 minute timeout.
> > > > > > > >
> > > > > > > > Does anyone have any clues about a) what's happening and
> > > > > > > > b)
> > > > > > > > how to get our desired behaviour of being able to control
> > > > > > > > both mount and read/write timeouts down to a few seconds?
> > > > > > > >
> > > > > > > > Thanks.
> > > > > > >
> > > > > > > I thought that changing TCP_RTO_MAX in include/net/tcp.h
> > > > > > > from
> > > > > > > 120 to
> > > > > > > something smaller and recompiling the kernel would change
> > > > > > > the 2 minute timeout, but it had no effect.  I'm going to
> > > > > > > keep poking through the kernel code to see if there's a knob
> > > > > > > I can turn to change the 2 minute timeout, so that I can at
> > > > > > > least understand where it's coming from.
> > > > > > >
> > > > > > > Any hints as to where I should be looking?
> > > > > >
> > > > > > I believe I've made some progress with this today:
> > > > > >
> > > > > >  - Calls to rpc_create() from fs/nfs/client.c are sending an
> > > > > > rpc_timeout struct with their args.
> > > > > >  - rpc_create() does *not* pass the timeout on to
> > > > > > xprt_create_transport(), which then can't pass it on to
> > > > > > xs_setup_tcp().
> > > > > >  - xs_setup_tcp(), having no timeout passed to it, uses
> > > > > > xs_tcp_default_timeout instead.
> > > > > >  - changing xs_tcp_default_timeout changes the "ls" timeout
> > > > > > behaviour
> > > > > > I described above.
> > > > > >
> > > > > > In theory all of this means that the timeout simply needs to
> > > > > > be
> > > > > > passed
> > > > > > through and used instead of xs_tcp_default_timeout.  I'm going
> > > > > > to
> > > > > > give
> > > > > > this a try tomorrow.
> > > > > >
> > > > >
> > > > > That's a great root-cause analysis. The interlocking timeouts
> > > > > involved with
> > > > > NFS and its sockets can be really difficult to unwind.
> > > > >
> > > > > Is there a way to automate this testcase? That might be nice to
> > > > > have
> > > > > in
> > > > > xfstests or the nfstest suite.
> > > > >
> > > > > > Here's what I'm going to try first; I'm no C programmer,
> > > > > > though,
> > > > > > so
> > > > > > any advice or corrections you might have would be appreciated.
> > > > > >
> > > > > > Thanks.
> > > > > >
> > > > > > Andrew
> > > > > >
> > > > > > diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index
> > > > > > 0b0b9f1eed46..1350c1f489f7 100644
> > > > > > --- a/net/sunrpc/clnt.c
> > > > > > +++ b/net/sunrpc/clnt.c
> > > > > > @@ -532,6 +532,7 @@ struct rpc_clnt *rpc_create(struct
> > > > > > rpc_create_args
> > > > > > *args)
> > > > > >                 .addrlen = args->addrsize,
> > > > > >                 .servername = args->servername,
> > > > > >                 .bc_xprt = args->bc_xprt,
> > > > > > +               .timeout = args->timeout,
> > > > > >         };
> > > > > >         char servername[48];
> > > > > >         struct rpc_clnt *clnt;
> > > > > > diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
> > > > > > index
> > > > > > aaa5b2741b79..adc79d94b59e 100644
> > > > > > --- a/net/sunrpc/xprtsock.c
> > > > > > +++ b/net/sunrpc/xprtsock.c
> > > > > > @@ -3003,7 +3003,7 @@ static struct rpc_xprt
> > > > > > *xs_setup_tcp(struct
> > > > > > xprt_create *args)
> > > > > >         xprt->idle_timeout = XS_IDLE_DISC_TO;
> > > > > >
> > > > > >         xprt->ops = &xs_tcp_ops;
> > > > > > -       xprt->timeout = &xs_tcp_default_timeout;
> > > > > > +       xprt->timeout = args->timeout;
> > > > > >
> > > > > >         xprt->max_reconnect_timeout = xprt->timeout-
> > > > > > >to_maxval;
> > > > > >         xprt->connect_timeout = xprt->timeout->to_initval *
> > > > > >
> > > > >
> > > > > Looks like you're probably on the right track. You're missing a
> > > > > few
> > > > > things:
> > > > >
> > > > > You'll need to add a "timeout" field to struct xprt_create in
> > > > > include/linux/sunrpc/xprt.h, and there may be some other places
> > > > > that
> > > > > either
> > > > > need to set the timeout in that structure, or do something with
> > > > > that
> > > > > field
> > > > > when it's set.
> > > > >
> > > > > Once you have something that fixes your reproducer, go ahead and
> > > > > post it
> > > > > and we can help you work through whatever changes need to me
> > > > > made
> > > to
> > > > > make it work.
> > > > >
> > > > > Nice work!
> > > >
> > > > Thanks for the tip, that was helpful.
> > > >
> > > > Currently I'm fighting with kernel recompilation.  I decided to
> > > > make
> > > > it quicker by slimming down the config, but apparently I've
> > > > achieved
> > > > something which Google claims no one else has achieved:
> > > >
> > > > Errors on kernel make modules_install:
> > > >
> > > >   DEPMOD  /lib/modules/6.2.0-rc5-sunrpctimeo+
> > > > depmod: WARNING: /lib/modules/6.2.0-rc5-
> > > > sunrpctimeo+/kernel/fs/nfs/nfsv4.ko needs unknown symbol
> > > > nfs4_disable_idmapping
> > > > depmod: WARNING: /lib/modules/6.2.0-rc5-
> > > > sunrpctimeo+/kernel/fs/nfs/nfsv4.ko needs unknown symbol
> > > > nfs4_label_alloc
> > > > depmod: WARNING: /lib/modules/6.2.0-rc5-
> > > > sunrpctimeo+/kernel/fs/nfs/nfsv4.ko needs unknown symbol
> > > > send_implementation_id
> > > > depmod: WARNING: /lib/modules/6.2.0-rc5-
> > > > sunrpctimeo+/kernel/fs/nfs/nfsv4.ko needs unknown symbol
> > > > nfs_atomic_open
> > > > depmod: WARNING: /lib/modules/6.2.0-rc5-
> > > > sunrpctimeo+/kernel/fs/nfs/nfsv4.ko needs unknown symbol
> > > > nfs_clear_verifier_delegated
> > > > depmod: WARNING: /lib/modules/6.2.0-rc5-
> > > > sunrpctimeo+/kernel/fs/nfs/nfsv4.ko needs unknown symbol
> > > > nfs4_client_id_uniquifier
> > > > depmod: WARNING: /lib/modules/6.2.0-rc5-
> > > > sunrpctimeo+/kernel/fs/nfs/nfsv4.ko needs unknown symbol
> > > > nfs4_dentry_operations
> > > > depmod: WARNING: /lib/modules/6.2.0-rc5-
> > > > sunrpctimeo+/kernel/fs/nfs/nfsv4.ko needs unknown symbol
> > > > nfs_fscache_open_file
> > > > depmod: WARNING: /lib/modules/6.2.0-rc5-
> > > > sunrpctimeo+/kernel/fs/nfs/nfsv4.ko needs unknown symbol
> > > nfs4_fs_type
> > > > depmod: WARNING: /lib/modules/6.2.0-rc5-
> > > > sunrpctimeo+/kernel/fs/nfs/nfsv4.ko needs unknown symbol
> > > > recover_lost_locks
> > > > depmod: WARNING: /lib/modules/6.2.0-rc5-
> > > > sunrpctimeo+/kernel/fs/nfs/nfsv4.ko needs unknown symbol
> > > > nfs_callback_nr_threads
> > > > depmod: WARNING: /lib/modules/6.2.0-rc5-
> > > > sunrpctimeo+/kernel/fs/nfs/nfsv4.ko needs unknown symbol
> > > > max_session_cb_slots
> > > > depmod: WARNING: /lib/modules/6.2.0-rc5-
> > > > sunrpctimeo+/kernel/fs/nfs/nfsv4.ko needs unknown symbol
> > > > max_session_slots
> > > > depmod: WARNING: /lib/modules/6.2.0-rc5-
> > > > sunrpctimeo+/kernel/fs/nfs/nfsv4.ko needs unknown symbol
> > > > nfs_idmap_cache_timeout
> > > > depmod: WARNING: /lib/modules/6.2.0-rc5-
> > > > sunrpctimeo+/kernel/fs/nfs/nfsv4.ko needs unknown symbol
> > > > nfs_callback_set_tcpport
> > > >
> > > > Errors on module load:
> > > >
> > > > [   94.008271] nfsv4: Unknown symbol nfs4_disable_idmapping (err -
> > > > 2)
> > > > [   94.008321] nfsv4: Unknown symbol nfs4_label_alloc (err -2)
> > > > [   94.008434] nfsv4: Unknown symbol send_implementation_id (err -
> > > > 2)
> > > > [   94.008446] nfsv4: Unknown symbol nfs_atomic_open (err -2)
> > > > [   94.008468] nfsv4: Unknown symbol nfs_clear_verifier_delegated
> > > > (err
> > > > -2)
> > > > [   94.008475] nfsv4: Unknown symbol nfs4_client_id_uniquifier
> > > > (err -
> > > > 2)
> > > > [   94.008501] nfsv4: Unknown symbol nfs4_dentry_operations (err -
> > > > 2)
> > > > [   94.008521] nfsv4: Unknown symbol nfs_fscache_open_file (err -
> > > > 2)
> > > > [   94.008566] nfsv4: Unknown symbol nfs4_fs_type (err -2)
> > > > [   94.008595] nfsv4: Unknown symbol recover_lost_locks (err -2)
> > > > [   94.008639] nfsv4: Unknown symbol nfs_callback_nr_threads (err
> > > > -2)
> > > > [   94.008654] nfsv4: Unknown symbol max_session_cb_slots (err -2)
> > > > [   94.008678] nfsv4: Unknown symbol max_session_slots (err -2)
> > > > [   94.008694] nfsv4: Unknown symbol nfs_idmap_cache_timeout (err
> > > > -2)
> > > > [   94.008709] nfsv4: Unknown symbol nfs_callback_set_tcpport (err
> > > > -2)
> > > >
> > > > I suspect I've turned something off in the config that I shouldn't
> > > > have, but I'm not sure what.  I see that one of the symbols
> > > > (nfs_clear_verifier_delegated) is in include/linux/nfs_fs.h, and
> > > > the
> > > > others are defined in fs/nfs/nfs4_fs.h, fs/nfs/super.c,
> > > > fs/nfs/dir.c,
> > > > fs/nfs/inode.c, fs/nfs/fscache.c, and fs/nfs/fs_context.c.  I'm
> > > > changing config options and recompiling to try to figure out what
> > > > I'm
> > > > missing, but at a couple of hours per compile and only a couple of
> > > > days a week to work on this it's slow going.  Any hints as to what
> > > > I
> > > > might be doing wrong would be appreciated.  😊
> > > >
> > >
> > > Looks like the ABI got broken when you turned off some options.
> > >
> > > Generally, if you just want to build a single module, then you want
> > > the
> > > .config to be _exactly_ the one that you used to build the kernel
> > > you're
> > > going to plug it into. Then to build the modules under fs/nfs you
> > > can
> > > do:
> > >
> > >     make modules_prepare
> > >     make M=fs/nfs
> > >
> > > ...and then drop the resulting .ko objects into the right place in
> > > /lib/modules.
> > >
> > > That said, it may be simpler to just build and work with a whole
> > > kernel
> > > for testing purposes. Working with an individual kmod can be a bit
> > > tricky unless you know what you're doing.
> > >
> > > Once you do the first, subsequent builds should be reasonably fast.
> >
> > I'm going to go back to a full kernel build with make oldconfig using
> > the distro's kernel config to try to avoid this latest issue, then try
> > what you've suggested to speed up recompiles.
> >
> > Since my changes are in net/sunrpc, should I be doing something like
> > this?
> >
> > make modules_prepare
> > make M=net/sunrpc
> > make M=fs/nfs
> >
> > Or do I not need to recompile nfs if I'm only touching the internals
> > of sunrpc?
> >
> > Thanks again.
> >
> > Andrew
> >
> >
> >
> 
> You shouldn't need to build both *UNLESS* you change the ABI. That
> includes stuff like number of arguments to an exported function, or the
> size or layout of particular structures or arrays that both modules
> might be working with, etc...
> 
> If you do that, then things can break in all sorts of "interesting" ways
> that can be very hard to track down. Without seeing your patch, it's
> hard to know whether you're breaking those rules here. YMMV, of course.
> 
> Again, doing a full kernel build is the safest way to avoid that sort of
> thing. I'd council against shortcuts here unless you know what you're
> doing. Let the machine do the work. ;)

In the end I had to do a fresh compile with a different EXTRAVERSION to get rid of the "Unknown symbol" errors.  I'm guessing it has something to do with leftovers in /boot or /lib/modules from an old build, but I didn't pursue it.

After a few tries, I finally got a version that didn't put random values in the timeout or make the kernel blow up.  Now the compiler gives me a warning about "assignment discards ‘const’ qualifier from pointer target type", but it works.  I'd be happy to get advice on how to make it better.  (Keeping in mind that I'm far from a C programmer.  :-) )

Here's the patch so far:

diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index b9f59aabee53..5b7fdaff9267 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -333,6 +333,7 @@ struct xprt_create {
        struct svc_xprt         *bc_xprt;       /* NFSv4.1 backchannel */
        struct rpc_xprt_switch  *bc_xps;
        unsigned int            flags;
+       struct rpc_timeout      *timeout;       /* timeout parms */
 };

 struct xprt_class {
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 0b0b9f1eed46..1350c1f489f7 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -532,6 +532,7 @@ struct rpc_clnt *rpc_create(struct rpc_create_args *args)
                .addrlen = args->addrsize,
                .servername = args->servername,
                .bc_xprt = args->bc_xprt,
+               .timeout = args->timeout,
        };
        char servername[48];
        struct rpc_clnt *clnt;
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index aaa5b2741b79..0011596be4ad 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -2174,6 +2174,17 @@ static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt,
        unsigned int keepcnt;
        unsigned int timeo;

+       dprintk("xs_tcp_set_socket_timeouts: xprt->timeout->to_initval: %lu\n",
+               xprt->timeout->to_initval);
+       dprintk("xs_tcp_set_socket_timeouts: xprt->timeout->to_maxval: %lu\n",
+               xprt->timeout->to_maxval);
+       dprintk("xs_tcp_set_socket_timeouts: xprt->timeout->to_increment: %lu\n",
+               xprt->timeout->to_increment);
+       dprintk("xs_tcp_set_socket_timeouts: xprt->timeout->to_retries: %u\n",
+               xprt->timeout->to_retries);
+       dprintk("xs_tcp_set_socket_timeouts: xprt->timeout->to_exponential: %u\n",
+               xprt->timeout->to_exponential);
+
        spin_lock(&xprt->transport_lock);
        keepidle = DIV_ROUND_UP(xprt->timeout->to_initval, HZ);
        keepcnt = xprt->timeout->to_retries + 1;
@@ -3003,7 +3014,24 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args)
        xprt->idle_timeout = XS_IDLE_DISC_TO;

        xprt->ops = &xs_tcp_ops;
-       xprt->timeout = &xs_tcp_default_timeout;
+
+       if (args->timeout) {
+               xprt->timeout = kmalloc(sizeof(struct rpc_timeout), GFP_KERNEL);
+               memcpy(xprt->timeout, args->timeout, sizeof(struct rpc_timeout));
+       } else {
+               xprt->timeout = &xs_tcp_default_timeout;
+       }
+
+       dprintk("xs_setup_tcp: xprt->timeout->to_initval: %lu\n",
+               xprt->timeout->to_initval);
+       dprintk("xs_setup_tcp: xprt->timeout->to_maxval: %lu\n",
+               xprt->timeout->to_maxval);
+       dprintk("xs_setup_tcp: xprt->timeout->to_increment: %lu\n",
+               xprt->timeout->to_increment);
+       dprintk("xs_setup_tcp: xprt->timeout->to_retries: %u\n",
+               xprt->timeout->to_retries);
+       dprintk("xs_setup_tcp: xprt->timeout->to_exponential: %u\n",
+               xprt->timeout->to_exponential);
        xprt->max_reconnect_timeout = xprt->timeout->to_maxval;
        xprt->connect_timeout = xprt->timeout->to_initval *





[Index of Archives]     [Linux Filesystem Development]     [Linux USB Development]     [Linux Media Development]     [Video for Linux]     [Linux NILFS]     [Linux Audio Users]     [Yosemite Info]     [Linux SCSI]

  Powered by Linux