Following nbd and iscsi, commit 89baaa570ab0 ("libceph: use memalloc flags for net IO") set SOCK_MEMALLOC and PF_MEMALLOC flags for rbd and cephfs. However, it turned out to not play nice with the loopback scenario, leading to lockups with a full socket send-q and an empty recv-q. While we always advised against colocating the kernel client and ceph servers on the same box, a few people are doing it and it's also useful for light development testing, so rather than reverting, make sure to not set those flags in the loopback case. Cc: Mike Christie <michaelc@xxxxxxxxxxx> Cc: Mel Gorman <mgorman@xxxxxxx> Cc: Sage Weil <sage@xxxxxxxxxx> Cc: stable@xxxxxxxxxxxxxxx # 3.18+, needs backporting Signed-off-by: Ilya Dryomov <idryomov@xxxxxxxxx> --- net/ceph/messenger.c | 40 +++++++++++++++++++++++++++++++++++++--- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 6b3f54ed65ba..9fa2cce71164 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -101,6 +101,7 @@ #define CON_FLAG_WRITE_PENDING 2 /* we have data ready to send */ #define CON_FLAG_SOCK_CLOSED 3 /* socket state changed to closed */ #define CON_FLAG_BACKOFF 4 /* need to retry queuing delayed work */ +#define CON_FLAG_LOCAL 5 /* using loopback interface */ static bool con_flag_valid(unsigned long con_flag) { @@ -110,6 +111,7 @@ static bool con_flag_valid(unsigned long con_flag) case CON_FLAG_WRITE_PENDING: case CON_FLAG_SOCK_CLOSED: case CON_FLAG_BACKOFF: + case CON_FLAG_LOCAL: return true; default: return false; @@ -470,6 +472,18 @@ static void set_sock_callbacks(struct socket *sock, * socket helpers */ +static bool sk_is_loopback(struct sock *sk) +{ + struct dst_entry *dst = sk_dst_get(sk); + bool ret = false; + + if (dst) { + ret = dst->dev && (dst->dev->flags & IFF_LOOPBACK); + dst_release(dst); + } + return ret; +} + /* * initiate connection to a remote socket. 
*/ @@ -484,7 +498,7 @@ static int ceph_tcp_connect(struct ceph_connection *con) IPPROTO_TCP, &sock); if (ret) return ret; - sock->sk->sk_allocation = GFP_NOFS | __GFP_MEMALLOC; + sock->sk->sk_allocation = GFP_NOFS; #ifdef CONFIG_LOCKDEP lockdep_set_class(&sock->sk->sk_lock, &socket_class); @@ -510,6 +524,11 @@ static int ceph_tcp_connect(struct ceph_connection *con) return ret; } + if (sk_is_loopback(sock->sk)) + con_flag_set(con, CON_FLAG_LOCAL); + else + con_flag_clear(con, CON_FLAG_LOCAL); + if (con->msgr->tcp_nodelay) { int optval = 1; @@ -520,7 +539,18 @@ static int ceph_tcp_connect(struct ceph_connection *con) ret); } - sk_set_memalloc(sock->sk); + /* + * Tagging with SOCK_MEMALLOC / setting PF_MEMALLOC may lead to + * lockups if our peer is on the same host (communicating via + * loopback) due to sk_filter() mercilessly dropping pfmemalloc + * skbs on the receiving side - receiving loopback socket is + * not going to be tagged with SOCK_MEMALLOC. See: + * + * - http://article.gmane.org/gmane.linux.kernel/1418791 + * - http://article.gmane.org/gmane.linux.kernel.stable/46128 + */ + if (!con_flag_test(con, CON_FLAG_LOCAL)) + sk_set_memalloc(sock->sk); con->sock = sock; return 0; @@ -2811,7 +2841,11 @@ static void con_work(struct work_struct *work) unsigned long pflags = current->flags; bool fault; - current->flags |= PF_MEMALLOC; + /* + * See SOCK_MEMALLOC comment in ceph_tcp_connect(). + */ + if (!con_flag_test(con, CON_FLAG_LOCAL)) + current->flags |= PF_MEMALLOC; mutex_lock(&con->mutex); while (true) { -- 1.9.3 -- To unsubscribe from this list: send the line "unsubscribe ceph-devel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html