Hi, here is a patch in two versions, one for mainline and the second for SLES11 SP1. I don't know, whether this is the best solution, but the version for SLES11 fixes the problem for me. It already is tested for about 12 hours and I just started the test again to have it run for the entire weekend. The patch for mainline is untested. Bodo ################################ From: Bodo Stroesser <bstroesser@xxxxxxxxxxxxxx> Date: Fri, 15 Mar 2013 Subject: [PATCH] net: sunrpc: fix race in RPC cache This patch applies to linux-3.7.6 plus 2 earlier patches that also fix races If between a thread's calls to sunrpc_cache_lookup() and cache_check() the cache entry returned by lookup is modified by a) sunrpc_cache_update() setting expiry_time is set to 0 b) cache_clean() unhashing it, as it now is expired then the thread will perform an upcall for the outdated cache entry. That cache_request will stay on the queue for ever, as a reply to the request will always match a newer cache entry and the request isn't dequeued. OTOH, cache_clean() already unhashed the old entry, thus cache_clean() won't dequeue the request also. To fix the problem, in cache_check() we recheck the state of the cache entry after having set CACHE_PENDING. So we are sure to never do an upcall for an outdated entry. Signed-off-by: Bodo Stroesser <bstroesser@xxxxxxxxxxxxxx> --- --- a/net/sunrpc/cache.c 2013-03-15 21:14:54.000000000 +0100 +++ b/net/sunrpc/cache.c 2013-03-15 21:29:09.000000000 +0100 @@ -222,6 +222,17 @@ static inline int cache_is_valid(struct } } +static inline int cache_wants_upcall(struct cache_head *h, int final) +{ + long refresh_age = (h->expiry_time - h->last_refresh); + long age = seconds_since_boot() - h->last_refresh; + + if (final) + dprintk("RPC: Want update, refage=%ld, age=%ld\n", refresh_age, age); + + return h->expiry_time && age > refresh_age/2; +} + static int try_to_negate_entry(struct cache_detail *detail, struct cache_head *h) { int rv; @@ -255,24 +266,22 @@ static int try_to_negate_entry(struct ca int cache_check(struct cache_detail *detail, struct cache_head *h, struct cache_req *rqstp) { - int rv; - long refresh_age, age; + int rv, rc; /* First decide return status as best we can */ rv = cache_is_valid(detail, h); - /* now see if we want to start an upcall */ - refresh_age = (h->expiry_time - h->last_refresh); - age = seconds_since_boot() - h->last_refresh; - if (rqstp == NULL) { if (rv == -EAGAIN) rv = -ENOENT; - } else if (rv == -EAGAIN || age > refresh_age/2) { - dprintk("RPC: Want update, refage=%ld, age=%ld\n", - refresh_age, age); + } else if (rv == -EAGAIN || cache_wants_upcall(h, 0)) { if (!test_and_set_bit(CACHE_PENDING, &h->flags)) { - switch (cache_make_upcall(detail, h)) { + rv = cache_is_valid(detail, h); + if (rv == -EAGAIN || cache_wants_upcall(h, 1)) + rc = cache_make_upcall(detail, h); + else + rc = -EAGAIN; + switch (rc) { case -EINVAL: rv = try_to_negate_entry(detail, h); break; ################################ From: Bodo Stroesser <bstroesser@xxxxxxxxxxxxxx> Date: Fri, 15 Mar 2013 Subject: [PATCH] net: sunrpc: fix race in RPC cache This patch applies to linux-2.6.32.59-0.7.1.4543.0.PTF plus 2 earlier patches that also fix races If between a thread's calls to sunrpc_cache_lookup() and cache_check() the cache entry returned by lookup is modified by a) sunrpc_cache_update() setting expiry_time is set to 0 b) cache_clean() unhashing it, as it now is expired then the thread will perform an upcall for the outdated cache entry. That cache_request will stay on the queue for ever, as a reply to the request will always match a newer cache entry and the request isn't dequeued. OTOH, cache_clean() already unhashed the old entry, thus cache_clean() won't dequeue the request also. On this older SW version, the problem additionally leads to RPC requests being dropped. To fix the problem, in cache_check() we recheck the state of the cache entry after having set CACHE_PENDING. So we are sure to never do an upcall for an outdated entry. Signed-off-by: Bodo Stroesser <bstroesser@xxxxxxxxxxxxxx> --- --- a/net/sunrpc/cache.c 2013-03-15 20:03:54.000000000 +0100 +++ b/net/sunrpc/cache.c 2013-03-15 20:14:13.000000000 +0100 @@ -184,7 +184,7 @@ static int cache_make_upcall(struct cach static inline int cache_is_valid(struct cache_detail *detail, struct cache_head *h) { if (!test_bit(CACHE_VALID, &h->flags) || - h->expiry_time < monotonic_seconds()) + (h->expiry_time && h->expiry_time < monotonic_seconds())) return -EAGAIN; else if (detail->flush_time > h->last_refresh) return -EAGAIN; @@ -197,6 +197,17 @@ static inline int cache_is_valid(struct } } +static inline int cache_wants_upcall(struct cache_head *h, int final) +{ + long refresh_age = (h->expiry_time - h->last_refresh); + long age = monotonic_seconds() - h->last_refresh; + + if (final) + dprintk("RPC: Want update, refage=%ld, age=%ld\n", refresh_age, age); + + return h->expiry_time && age > refresh_age/2; +} + /* * This is the generic cache management routine for all * the authentication caches. @@ -214,24 +225,22 @@ static inline int cache_is_valid(struct int cache_check(struct cache_detail *detail, struct cache_head *h, struct cache_req *rqstp) { - int rv; - long refresh_age, age; + int rv, rc; /* First decide return status as best we can */ rv = cache_is_valid(detail, h); - /* now see if we want to start an upcall */ - refresh_age = (h->expiry_time - h->last_refresh); - age = monotonic_seconds() - h->last_refresh; - if (rqstp == NULL) { if (rv == -EAGAIN) rv = -ENOENT; - } else if (rv == -EAGAIN || age > refresh_age/2) { - dprintk("RPC: Want update, refage=%ld, age=%ld\n", - refresh_age, age); + } else if (rv == -EAGAIN || cache_wants_upcall(h, 0)) { if (!test_and_set_bit(CACHE_PENDING, &h->flags)) { - switch (cache_make_upcall(detail, h)) { + rv = cache_is_valid(detail, h); + if (rv == -EAGAIN || cache_wants_upcall(h, 1)) + rc = cache_make_upcall(detail, h); + else + rc = -EAGAIN; + switch (rc) { case -EINVAL: write_lock(&detail->hash_lock); if (rv) { ÿôèº{.nÇ+?·?®??+%?Ëÿ±éݶ¥?wÿº{.nÇ+?·¥?{±þwìþ)í?æèw*jg¬±¨¶????Ý¢jÿ¾«þG«?éÿ¢¸¢·¦j:+v?¨?wèjØm¶?ÿþø¯ù®w¥þ?àþf£¢·h??â?úÿ?Ù¥