From: Pavel Emelyanov <xemul@xxxxxxxxxxxxx> Co-authored-by: Dmitry V. Levin <ldv@xxxxxxxxxxxx> Signed-off-by: Pavel Emelyanov <xemul@xxxxxxxxxxxxx> Signed-off-by: Dmitry V. Levin <ldv@xxxxxxxxxxxx> --- man7/sock_diag.7 | 865 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 865 insertions(+) create mode 100644 man7/sock_diag.7 diff --git a/man7/sock_diag.7 b/man7/sock_diag.7 new file mode 100644 index 0000000..c4948d7 --- /dev/null +++ b/man7/sock_diag.7 @@ -0,0 +1,865 @@ +.\" Copyright (c) 2016 Pavel Emelyanov <xemul@xxxxxxxxxxxxx> +.\" Copyright (c) 2016 Dmitry V. Levin <ldv@xxxxxxxxxxxx> +.\" +.\" %%%LICENSE_START(GPLv2+_DOC_FULL) +.\" This is free documentation; you can redistribute it and/or +.\" modify it under the terms of the GNU General Public License as +.\" published by the Free Software Foundation; either version 2 of +.\" the License, or (at your option) any later version. +.\" +.\" The GNU General Public License's references to "object code" +.\" and "executables" are to be interpreted as the output of any +.\" document formatting or typesetting system, including +.\" intermediate and printed output. +.\" +.\" This manual is distributed in the hope that it will be useful, +.\" but WITHOUT ANY WARRANTY; without even the implied warranty of +.\" MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +.\" GNU General Public License for more details. +.\" +.\" You should have received a copy of the GNU General Public +.\" License along with this manual; if not, see +.\" <http://www.gnu.org/licenses/>. 
+.\" %%%LICENSE_END +.TH SOCK_DIAG 7 2016-12-07 "Linux" "Linux Programmer's Manual" +.SH NAME +sock_diag \- obtaining information about sockets +.SH SYNOPSIS +.nf +.B #include <sys/socket.h> +.B #include <linux/sock_diag.h> +.BR "#include <linux/unix_diag.h>" " /* for UNIX domain sockets */" +.BR "#include <linux/inet_diag.h>" " /* for IPv4 and IPv6 sockets */" + +.BI "diag_socket = socket(AF_NETLINK, " socket_type ", NETLINK_SOCK_DIAG);" +.fi +.SH DESCRIPTION +The sock_diag netlink subsystem provides a mechanism for obtaining +information about sockets of various protocol families from the kernel. +This subsystem can be used to obtain information about individual +sockets or request a list of sockets. + +In the request the caller can specify additional information it would +like to obtain about the socket, e.g. memory information or +family-specific information. + +When requesting a list of sockets, the caller can specify filters that +would be applied by the kernel to select a subset of sockets to report. +For now, it is only possible to filter sockets by state (connected, +listening, etc.). + +Note that sock_diag reports only those sockets that have a name, +i.e. either bound explicitly with +.BR bind (2) +or sockets that were automatically bound to an address (e.g., by +.BR connect (2)). +This is the same set of sockets that is available via +.IR /proc/net/unix , +.IR /proc/net/tcp , +.IR /proc/net/udp , +etc. + +.SS Request +The request starts with +.I "struct nlmsghdr" +header described in +.BR netlink (7) +with +.I nlmsg_type +field set to +.BR SOCK_DIAG_BY_FAMILY . +It is followed by a protocol family specific header that starts with +a common part shared by all protocol families: + +.in +4n +.nf +struct sock_diag_req { + __u8 sdiag_family; + __u8 sdiag_protocol; +}; +.fi +.in +.PP +The fields of this structure are as follows: +.TP +.I sdiag_family +A protocol family. It should be set to the appropriate +.B AF_* +constant. 
+.TP +.I sdiag_protocol +Depends on +.IR sdiag_family . +It should be set to the appropriate +.B IPPROTO_* +constant for +.B AF_INET +and +.BR AF_INET6 , +and to 0 otherwise. +.PP +If +.I nlmsg_flags +field of the +.I "struct nlmsghdr" +header has +.BR NLM_F_DUMP +flag set, it means that a list of sockets is being requested; +otherwise it is a query about an individual socket. + +.SS Response +The response starts with +.I "struct nlmsghdr" +header and is followed by an array of family-specific objects. +The array is to be accessed with the standard +.B NLMSG_* +macros from +.BR netlink (3) +API. +.PP +Each object is the NLA (netlink attributes) list that is to be accessed +with the +.B RTA_* +macros from +.BR rtnetlink (3) +API. + +.SS UNIX domain sockets +For UNIX domain sockets the request is represented in the following structure: + +.in +4n +.nf +struct unix_diag_req { + __u8 sdiag_family; + __u8 sdiag_protocol; + __u16 pad; + __u32 udiag_states; + __u32 udiag_ino; + __u32 udiag_show; + __u32 udiag_cookie[2]; +}; +.fi +.in +.PP +The fields of this structure are as follows: +.TP +.I sdiag_family +The protocol family; it should be set to +.BR AF_UNIX . +.PP +.I sdiag_protocol +.PD 0 +.TP +.PD +.I pad +These fields should be set to 0. +.TP +.I udiag_states +This is a bit mask that defines a filter of socket states. +Only those sockets whose states are in this mask will be reported. +Ignored when querying for an individual socket. +Supported values are: +.PD 0 +.RS +.IP "" 2 +1 << +.B TCP_ESTABLISHED +.IP +1 << +.B TCP_LISTEN +.RE +.PD +.TP +.I udiag_ino +This is an inode number when querying for an individual socket. +Ignored when querying for a list of sockets. +.TP +.I udiag_show +This is a set of flags defining what kind of information to report. +Each requested kind of information is reported back as a netlink +attribute as described below: +.RS +.IP "" 2 +.B UDIAG_SHOW_NAME +.RS 4 +The attribute reported in answer to this request is +.BR UNIX_DIAG_NAME . 
+The payload associated with this attribute is the pathname to which +the socket was bound (a sequence of bytes up to +.B UNIX_PATH_MAX +length). +.RE +.IP "" 2 +.B UDIAG_SHOW_VFS +.RS 4 +The attribute reported in answer to this request is +.BR UNIX_DIAG_VFS . +The payload associated with this attribute is represented in the following +structure: + +.in +4n +.nf +struct unix_diag_vfs { + __u32 udiag_vfs_dev; + __u32 udiag_vfs_ino; +}; +.fi +.in + +The fields of this structure are as follows: +.PD 0 +.RS 2 +.IP "" 2 +.I udiag_vfs_dev +The device number of the corresponding on-disk socket inode. +.IP +.I udiag_vfs_ino +The inode number of the corresponding on-disk socket inode. +.RE +.PD +.RE +.IP +.B UDIAG_SHOW_PEER +.RS 4 +The attribute reported in answer to this request is +.BR UNIX_DIAG_PEER . +The payload associated with this attribute is a __u32 value +which is the peer's inode number. +This attribute is reported for connected sockets only. +.RE +.IP +.B UDIAG_SHOW_ICONS +.RS 4 +The attribute reported in answer to this request is +.BR UNIX_DIAG_ICONS . +The payload associated with this attribute is an array of __u32 values +which are inode numbers of sockets that have passed the +.BR connect (2) +call, but haven't been processed with +.BR accept (2) +yet. This attribute is reported for listening sockets only. +.RE +.IP +.B UDIAG_SHOW_RQLEN +.RS 4 +The attribute reported in answer to this request is +.BR UNIX_DIAG_RQLEN . +The payload associated with this attribute is represented in the following +structure: + +.in +4n +.nf +struct unix_diag_rqlen { + __u32 udiag_rqueue; + __u32 udiag_wqueue; +}; +.fi +.in + +The fields of this structure are as follows: +.PD 0 +.RS 2 +.IP "" 2 +.I udiag_rqueue +.RS 6 +.IP "listening sockets:" 2 +The number of pending connections. The length of the array associated with +.B UNIX_DIAG_ICONS +response attribute is equal to this value. +.IP "established sockets:" +The amount of data in incoming queue. 
+.RE +.IP +.I udiag_wqueue +.RS 6 +.IP "listening sockets:" 2 +The backlog length which equals the value passed as the second argument to +.BR listen (2). +.IP "established sockets:" +The amount of memory available for sending. +.RE +.RE +.PD +.RE +.IP +.B UDIAG_SHOW_MEMINFO +.RS 4 +The attribute reported in answer to this request is +.BR UNIX_DIAG_MEMINFO . +The payload associated with this attribute is an array of __u32 values +described below in "Socket memory information" subsection. +.RE +.IP +.RE +.RS +The following attributes are reported back without any specific request: +.IP "" 2 +.BR UNIX_DIAG_SHUTDOWN . +The payload associated with this attribute is __u8 value which represents +bits of +.BR shutdown (2) +state. +.RE +.TP +.I udiag_cookie +This is an array of opaque identifiers that could be used along with +.I udiag_ino +to specify an individual socket. It is ignored when querying for a list +of sockets, as well as when all its elements are set to \-1. +.PP +The response to a query for UNIX domain sockets is represented as an array of + +.in +4n +.nf +struct unix_diag_msg { + __u8 udiag_family; + __u8 udiag_type; + __u8 udiag_state; + __u8 pad; + __u32 udiag_ino; + __u32 udiag_cookie[2]; +}; +.fi +.in + +followed by netlink attributes. +.PP +The fields of this structure are as follows: +.TP +.I udiag_family +This field has the same meaning as in +.IR "struct unix_diag_req" . +.TP +.I udiag_type +This is set to one of the following constants: +.PD 0 +.RS +.IP "" 2 +.B SOCK_PACKET +.IP +.B SOCK_STREAM +.IP +.B SOCK_SEQPACKET +.RE +.PD +.TP +.I udiag_state +This is set to one of the following constants: +.PD 0 +.RS +.IP "" 2 +.B TCP_LISTEN +.IP +.B TCP_ESTABLISHED +.RE +.PD +.TP +.I pad +This field is set to 0. +.TP +.I udiag_ino +This is the socket inode number. +.TP +.I udiag_cookie +This is an array of opaque identifiers that could be used in subsequent +queries. 
+ +.SS IPv4 and IPv6 sockets +For IPv4 and IPv6 sockets the request is represented in the following structure: + +.in +4n +.nf +struct inet_diag_req_v2 { + __u8 sdiag_family; + __u8 sdiag_protocol; + __u8 idiag_ext; + __u8 pad; + __u32 idiag_states; + struct inet_diag_sockid id; +}; +.fi +.in + +where +.I "struct inet_diag_sockid" +is defined as follows: + +.in +4n +.nf +struct inet_diag_sockid { + __be16 idiag_sport; + __be16 idiag_dport; + __be32 idiag_src[4]; + __be32 idiag_dst[4]; + __u32 idiag_if; + __u32 idiag_cookie[2]; +}; +.fi +.in +.PP +The fields of +.I "struct inet_diag_req_v2" +are as follows: +.TP +.I sdiag_family +This should be set to either +.B AF_INET +or +.B AF_INET6 +for +.B IPv4 +or +.B IPv6 +sockets respectively. +.TP +.I sdiag_protocol +This should be set to one of the following constants: +.PD 0 +.RS +.IP "" 2 +.B IPPROTO_TCP +.IP +.B IPPROTO_UDP +.IP +.B IPPROTO_UDPLITE +.RE +.PD +.TP +.I idiag_ext +This is a set of flags defining what kind of extended information to report. +Each requested kind of information is reported back as a netlink attribute +as described below: +.RS +.TP +.B INET_DIAG_TOS +The payload associated with this attribute is a __u8 value +which is the TOS of the socket. +.TP +.B INET_DIAG_TCLASS +The payload associated with this attribute is a __u8 value +which is the TClass of the socket. IPv6 sockets only. +For LISTEN and CLOSE sockets this is followed by +.B INET_DIAG_SKV6ONLY +attribute with associated __u8 payload value meaning whether the socket +is IPv6-only or not. +.TP +.B INET_DIAG_MEMINFO +The payload associated with this attribute is represented in the following +structure: + +.in +4n +.nf +struct inet_diag_meminfo { + __u32 idiag_rmem; + __u32 idiag_wmem; + __u32 idiag_fmem; + __u32 idiag_tmem; +}; +.fi +.in + +The fields of this structure are as follows: +.RS +.TP 12 +.I idiag_rmem +The amount of data in the receive queue. +.TP +.I idiag_wmem +The amount of data that is queued by TCP but not yet sent. 
+.TP +.I idiag_fmem +The amount of memory scheduled for future use (TCP only). +.TP +.I idiag_tmem +The amount of data in send queue. +.RE +.TP +.B INET_DIAG_SKMEMINFO +The payload associated with this attribute is an array of __u32 values +described below in "Socket memory information" subsection. +.TP +.B INET_DIAG_INFO +The payload associated with this attribute is protocol specific. +For TCP sockets it is an object of type +.IR "struct tcp_info" . +.TP +.B INET_DIAG_CONG +The payload associated with this attribute is a string that describes the +congestion control algorithm used. For TCP sockets only. +.RE +.TP +.I pad +This should be set to 0. +.TP +.I idiag_states +This is a bit mask that defines a filter of socket states. +Only those sockets whose states are in this mask will be reported. +Ignored when querying for an individual socket. +.TP +.I id +This is a socket id object that is used in dump requests, in queries +about individual sockets, and is reported back in each response. +Unlike UNIX domain sockets, IPv4 and IPv6 sockets are identified +using addresses and ports. All values are in network byte order. +.PP +The fields of +.I "struct inet_diag_sockid" +are as follows: +.TP +.I idiag_sport +The source port. +.TP +.I idiag_dport +The destination port. +.TP +.I idiag_src +The source address. +.TP +.I idiag_dst +The destination address. +.TP +.I idiag_if +The interface number the socket is bound to. +.TP +.I idiag_cookie +This is an array of opaque identifiers that could be used along with +other fields of this structure to specify an individual socket. +It is ignored when querying for a list of sockets, as well as +when all its elements are set to \-1. 
+.PP +The response to a query for IPv4 or IPv6 sockets is represented as an array of + +.in +4n +.nf +struct inet_diag_msg { + __u8 idiag_family; + __u8 idiag_state; + __u8 idiag_timer; + __u8 idiag_retrans; + + struct inet_diag_sockid id; + + __u32 idiag_expires; + __u32 idiag_rqueue; + __u32 idiag_wqueue; + __u32 idiag_uid; + __u32 idiag_inode; +}; +.fi +.in + +followed by netlink attributes. +.PP +The fields of this structure are as follows: +.TP +.I idiag_family +This is the same field as in +.IR "struct inet_diag_req_v2" . +.TP +.I idiag_state +This denotes socket state as in +.IR "struct inet_diag_req_v2" . +.TP +.I idiag_timer +For TCP sockets, this field describes the type of timer that is currently +active for the socket. It is set to one of the following constants: +.RS +.TP +.B 0 +no timer is active +.TP +.B 1 +a retransmit timer +.TP +.B 2 +a keep-alive timer +.TP +.B 3 +a TIME_WAIT timer +.TP +.B 4 +a zero window probe timer +.RE +.IP +.RS +For non-TCP sockets this field is set to 0. +.RE +.TP +.I idiag_retrans +For +.I idiag_timer +values 1, 2, and 4 this field contains the number of retransmits. For other +.I idiag_timer +values this field is set to 0. +.TP +.I idiag_expires +For TCP sockets that have an active timer this field describes its expiration +time in milliseconds. For other sockets this field is set to 0. +.TP +.I idiag_rqueue +.RS 7 +.IP "listening sockets:" 2 +The number of pending connections. +.IP "other sockets:" +The amount of data in incoming queue. +.RE +.TP +.I idiag_wqueue +.RS 7 +.IP "listening sockets:" 2 +The backlog length. +.IP "other sockets:" +The amount of memory available for sending. +.RE +.TP +.I idiag_uid +This is the socket owner UID. +.TP +.I idiag_inode +This is the socket inode number. 
+ +.SS Socket memory information +The payload associated with +.B UNIX_DIAG_MEMINFO +and +.BR INET_DIAG_SKMEMINFO +netlink attributes is an array of the following __u32 values: +.TP +.B SK_MEMINFO_RMEM_ALLOC +The amount of data in receive queue. +.TP +.B SK_MEMINFO_RCVBUF +The receive socket buffer as set by +.BR SO_RCVBUF . +.TP +.B SK_MEMINFO_WMEM_ALLOC +The amount of data in send queue. +.TP +.B SK_MEMINFO_SNDBUF +The send socket buffer as set by +.BR SO_SNDBUF . +.TP +.B SK_MEMINFO_FWD_ALLOC +The amount of memory scheduled for future use (TCP only). +.TP +.B SK_MEMINFO_WMEM_QUEUED +The amount of data queued by TCP, but not yet sent. +.TP +.B SK_MEMINFO_OPTMEM +The amount of memory allocated for socket's service needs (e.g. socket +filter). +.TP +.B SK_MEMINFO_BACKLOG +The amount of packets in the backlog (not yet processed). +.SH CONFORMING TO +The NETLINK_SOCK_DIAG API is Linux-specific. +.SH VERSIONS +.B NETLINK_INET_DIAG +was introduced in Linux 2.6.14 and supported +.B AF_INET +and +.B AF_INET6 +sockets only. In Linux 3.3 it was renamed to +.B NETLINK_SOCK_DIAG +and extended to support +.B AF_UNIX +sockets. +.PP +.B UNIX_DIAG_MEMINFO +and +.BR INET_DIAG_SKMEMINFO +were introduced in Linux 3.6. +.SH EXAMPLE +The following example program prints inode number, peer's inode number, +and name of all UNIX domain sockets in the current namespace. 
+ +.nf +#include <errno.h> +#include <stdio.h> +#include <string.h> +#include <unistd.h> +#include <sys/socket.h> +#include <sys/un.h> +#include <linux/netlink.h> +#include <linux/rtnetlink.h> +#include <linux/sock_diag.h> +#include <linux/unix_diag.h> + +static int +send_query(int fd) +{ + struct sockaddr_nl nladdr = { + .nl_family = AF_NETLINK + }; + struct + { + struct nlmsghdr nlh; + struct unix_diag_req udr; + } req = { + .nlh = { + .nlmsg_len = sizeof(req), + .nlmsg_type = SOCK_DIAG_BY_FAMILY, + .nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP + }, + .udr = { + .sdiag_family = AF_UNIX, + .udiag_states = \-1, + .udiag_show = UDIAG_SHOW_NAME | UDIAG_SHOW_PEER + } + }; + struct iovec iov = { + .iov_base = &req, + .iov_len = sizeof(req) + }; + struct msghdr msg = { + .msg_name = (void *) &nladdr, + .msg_namelen = sizeof(nladdr), + .msg_iov = &iov, + .msg_iovlen = 1 + }; + + for (;;) { + if (sendmsg(fd, &msg, 0) < 0) { + if (errno == EINTR) + continue; + perror("sendmsg"); + return \-1; + } + return 0; + } +} + +static int +print_diag(const struct unix_diag_msg *diag, unsigned int len) +{ + if (len < NLMSG_LENGTH(sizeof(*diag))) { + fputs("short response\\n", stderr); + return \-1; + } + if (diag\->udiag_family != AF_UNIX) { + fprintf(stderr, "unexpected family %u\\n", diag\->udiag_family); + return \-1; + } + + struct rtattr *attr; + unsigned int rta_len = len \- NLMSG_LENGTH(sizeof(*diag)); + unsigned int peer = 0; + size_t path_len = 0; + char path[sizeof(((struct sockaddr_un *) 0)\->sun_path) + 1]; + + for (attr = (struct rtattr *) (diag + 1); + RTA_OK(attr, rta_len); attr = RTA_NEXT(attr, rta_len)) { + switch (attr\->rta_type) { + case UNIX_DIAG_NAME: + if (!path_len) { + path_len = RTA_PAYLOAD(attr); + if (path_len > sizeof(path) \- 1) + path_len = sizeof(path) \- 1; + memcpy(path, RTA_DATA(attr), path_len); + path[path_len] = '\\0'; + } + break; + case UNIX_DIAG_PEER: + if (RTA_PAYLOAD(attr) >= sizeof(peer)) + peer = *(unsigned int *) RTA_DATA(attr); + break; + } 
+ } + + printf("inode=%u", diag\->udiag_ino); + + if (peer) + printf(", peer=%u", peer); + + if (path_len) + printf(", name=%s%s", *path ? "" : "@", *path ? path : path + 1); + + putchar('\\n'); + return 0; +} + +static int +receive_responses(int fd) +{ + long buf[8192 / sizeof(long)]; + struct sockaddr_nl nladdr = { + .nl_family = AF_NETLINK + }; + struct iovec iov = { + .iov_base = buf, + .iov_len = sizeof(buf) + }; + int flags = 0; + + for (;;) { + struct msghdr msg = { + .msg_name = (void *) &nladdr, + .msg_namelen = sizeof(nladdr), + .msg_iov = &iov, + .msg_iovlen = 1 + }; + + ssize_t ret = recvmsg(fd, &msg, flags); + + if (ret < 0) { + if (errno == EINTR) + continue; + perror("recvmsg"); + return \-1; + } + if (ret == 0) + return 0; + + const struct nlmsghdr *h = (struct nlmsghdr *) buf; + + if (!NLMSG_OK(h, ret)) { + fputs("!NLMSG_OK\\n", stderr); + return \-1; + } + for (; NLMSG_OK(h, ret); h = NLMSG_NEXT(h, ret)) { + if (h\->nlmsg_type == NLMSG_DONE) + return 0; + if (h\->nlmsg_type == NLMSG_ERROR) { + const struct nlmsgerr *err = NLMSG_DATA(h); + + if (h\->nlmsg_len < NLMSG_LENGTH(sizeof(*err))) { + fputs("NLMSG_ERROR\\n", stderr); + } else { + errno = \-err\->error; + perror("NLMSG_ERROR"); + } + return \-1; + } + if (h\->nlmsg_type != SOCK_DIAG_BY_FAMILY) { + fprintf(stderr, "unexpected nlmsg_type %u\\n", + (unsigned) h\->nlmsg_type); + return \-1; + } + + if (print_diag(NLMSG_DATA(h), h\->nlmsg_len)) + return \-1; + } + } +} + +int +main(void) +{ + int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_SOCK_DIAG); + + if (fd < 0) { + perror("socket"); + return 1; + } + + int ret = send_query(fd) || receive_responses(fd); + + close(fd); + return ret; +} +.fi +.SH SEE ALSO +.BR netlink (3), +.BR rtnetlink (3), +.BR netlink (7), +.BR tcp (7) -- ldv -- To unsubscribe from this list: send the line "unsubscribe linux-man" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html