Le samedi 24 décembre 2011 à 02:25 +0100, pablo@xxxxxxxxxxxxx a écrit : > From: Pablo Neira Ayuso <pablo@xxxxxxxxxxxxx> > > We currently have two ways to account traffic in netfilter: > > - iptables chain and rule counters: > > # iptables -L -n -v > Chain INPUT (policy DROP 3 packets, 867 bytes) > pkts bytes target prot opt in out source destination > 8 1104 ACCEPT all -- lo * 0.0.0.0/0 0.0.0.0/0 > > - use flow-based accounting provided by ctnetlink: > > # conntrack -L > tcp 6 431999 ESTABLISHED src=192.168.1.130 dst=212.106.219.168 sport=58152 dport=80 packets=47 bytes=7654 src=212.106.219.168 dst=192.168.1.130 sport=80 dport=58152 packets=49 bytes=66340 [ASSURED] mark=0 use=1 > > While trying to display real-time accounting statistics, we require > to pool the kernel periodically to obtain this information. This is > OK if the number of flows is relatively low. However, in case that > the number of flows is huge, we can spend a considerable amount of > cycles to iterate over the list of flows that have been obtained. > > Moreover, if we want to obtain the sum of the flow accounting results > that match some criteria, we have to iterate over the whole list of > existing flows, look for matchings and update the counters. > > This patch adds the extended accounting infrastructure for > nfnetlink which aims to allow displaying real-time traffic accounting > without the need of complicated and resource-consuming implementation > in user-space. Basically, this new infrastructure allows you to create > accounting objects. One accounting object is composed of packet and > byte counters. > > In order to manipulate create accounting objects, you require the > new libnetfilter_acct library. 
It contains several examples of use: > > libnetfilter_acct/examples# ./nfacct-add http-traffic > libnetfilter_acct/examples# ./nfacct-get > http-traffic = { pkts = 000000000000, bytes = 000000000000 }; > > Then, you can use one of this accounting objects in several iptables > rules using the new nfacct match (which comes in a follow-up patch): > > # iptables -I INPUT -p tcp --sport 80 -m nfacct --nfacct-name http-traffic > # iptables -I OUTPUT -p tcp --dport 80 -m nfacct --nfacct-name http-traffic > > The idea is simple: if one packet matches the rule, the nfacct match > updates the counters. > > Thanks to Patrick McHardy and Eric Dumazet for reviewing and providing > feedback for this contribution. > > Signed-off-by: Pablo Neira Ayuso <pablo@xxxxxxxxxxxxx> > --- > include/linux/netfilter/Kbuild | 1 + > include/linux/netfilter/nfnetlink.h | 3 +- > include/linux/netfilter/nfnetlink_acct.h | 36 +++ > net/netfilter/Kconfig | 8 + > net/netfilter/Makefile | 1 + > net/netfilter/nfnetlink_acct.c | 361 ++++++++++++++++++++++++++++++ > 6 files changed, 409 insertions(+), 1 deletions(-) > create mode 100644 include/linux/netfilter/nfnetlink_acct.h > create mode 100644 net/netfilter/nfnetlink_acct.c > > diff --git a/include/linux/netfilter/Kbuild b/include/linux/netfilter/Kbuild > index d81f771..6785246 100644 > --- a/include/linux/netfilter/Kbuild > +++ b/include/linux/netfilter/Kbuild > @@ -7,6 +7,7 @@ header-y += nf_conntrack_tcp.h > header-y += nf_conntrack_tuple_common.h > header-y += nf_nat.h > header-y += nfnetlink.h > +header-y += nfnetlink_acct.h > header-y += nfnetlink_compat.h > header-y += nfnetlink_conntrack.h > header-y += nfnetlink_log.h > diff --git a/include/linux/netfilter/nfnetlink.h b/include/linux/netfilter/nfnetlink.h > index 74d3386..b64454c 100644 > --- a/include/linux/netfilter/nfnetlink.h > +++ b/include/linux/netfilter/nfnetlink.h > @@ -48,7 +48,8 @@ struct nfgenmsg { > #define NFNL_SUBSYS_ULOG 4 > #define NFNL_SUBSYS_OSF 5 > #define 
NFNL_SUBSYS_IPSET 6 > -#define NFNL_SUBSYS_COUNT 7 > +#define NFNL_SUBSYS_ACCT 7 > +#define NFNL_SUBSYS_COUNT 8 > > #ifdef __KERNEL__ > > diff --git a/include/linux/netfilter/nfnetlink_acct.h b/include/linux/netfilter/nfnetlink_acct.h > new file mode 100644 > index 0000000..7c4279b > --- /dev/null > +++ b/include/linux/netfilter/nfnetlink_acct.h > @@ -0,0 +1,36 @@ > +#ifndef _NFNL_ACCT_H_ > +#define _NFNL_ACCT_H_ > + > +#ifndef NFACCT_NAME_MAX > +#define NFACCT_NAME_MAX 32 > +#endif > + > +enum nfnl_acct_msg_types { > + NFNL_MSG_ACCT_NEW, > + NFNL_MSG_ACCT_GET, > + NFNL_MSG_ACCT_GET_CTRZERO, > + NFNL_MSG_ACCT_DEL, > + NFNL_MSG_ACCT_MAX > +}; > + > +enum nfnl_acct_type { > + NFACCT_UNSPEC, > + NFACCT_NAME, > + NFACCT_PKTS, > + NFACCT_BYTES, > + NFACCT_USE, > + __NFACCT_MAX > +}; > +#define NFACCT_MAX (__NFACCT_MAX - 1) > + > +#ifdef __KERNEL__ > + > +struct nf_acct; > + > +extern struct nf_acct *nfnl_acct_find_get(const char *filter_name); > +extern void nfnl_acct_put(struct nf_acct *acct); > +extern void nfnl_acct_update(const struct sk_buff *skb, struct nf_acct *nfacct); > + > +#endif /* __KERNEL__ */ > + > +#endif /* _NFNL_ACCT_H */ > diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig > index d5597b7..77326ac 100644 > --- a/net/netfilter/Kconfig > +++ b/net/netfilter/Kconfig > @@ -4,6 +4,14 @@ menu "Core Netfilter Configuration" > config NETFILTER_NETLINK > tristate > > +config NETFILTER_NETLINK_ACCT > +tristate "Netfilter NFACCT over NFNETLINK interface" > + depends on NETFILTER_ADVANCED > + select NETFILTER_NETLINK > + help > + If this option is enabled, the kernel will include support > + for extended accounting via NFNETLINK. 
> + > config NETFILTER_NETLINK_QUEUE > tristate "Netfilter NFQUEUE over NFNETLINK interface" > depends on NETFILTER_ADVANCED > diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile > index 1a02853..4da1c87 100644 > --- a/net/netfilter/Makefile > +++ b/net/netfilter/Makefile > @@ -7,6 +7,7 @@ nf_conntrack-$(CONFIG_NF_CONNTRACK_EVENTS) += nf_conntrack_ecache.o > obj-$(CONFIG_NETFILTER) = netfilter.o > > obj-$(CONFIG_NETFILTER_NETLINK) += nfnetlink.o > +obj-$(CONFIG_NETFILTER_NETLINK_ACCT) += nfnetlink_acct.o > obj-$(CONFIG_NETFILTER_NETLINK_QUEUE) += nfnetlink_queue.o > obj-$(CONFIG_NETFILTER_NETLINK_LOG) += nfnetlink_log.o > > diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c > new file mode 100644 > index 0000000..8de5efa > --- /dev/null > +++ b/net/netfilter/nfnetlink_acct.c > @@ -0,0 +1,361 @@ > +/* > + * (C) 2011 Pablo Neira Ayuso <pablo@xxxxxxxxxxxxx> > + * (C) 2011 Intra2net AG <http://www.intra2net.com> > + * > + * This program is free software; you can redistribute it and/or modify > + * it under the terms of the GNU General Public License version 2 as > + * published by the Free Software Foundation (or any later at your option). 
> + */ > +#include <linux/init.h> > +#include <linux/module.h> > +#include <linux/kernel.h> > +#include <linux/skbuff.h> > +#include <linux/netlink.h> > +#include <linux/rculist.h> > +#include <linux/slab.h> > +#include <linux/types.h> > +#include <linux/errno.h> > +#include <net/netlink.h> > +#include <net/sock.h> > +#include <asm/atomic.h> > + > +#include <linux/netfilter.h> > +#include <linux/netfilter/nfnetlink.h> > +#include <linux/netfilter/nfnetlink_acct.h> > + > +MODULE_LICENSE("GPL"); > +MODULE_AUTHOR("Pablo Neira Ayuso <pablo@xxxxxxxxxxxxx>"); > +MODULE_DESCRIPTION("nfacct: Extended Netfilter accounting infrastructure"); > + > +static LIST_HEAD(nfnl_acct_list); > + > +struct nf_acct { > + struct list_head head; > + atomic_t refcnt; > + > + char name[NFACCT_NAME_MAX]; > + atomic64_t pkts; > + atomic64_t bytes; > + > + struct rcu_head rcu_head; > +}; On a 64-bit arch: offsetof(struct nf_acct, pkts) = 0x38 and offsetof(struct nf_acct, bytes) = 0x40, i.e. the two counters sit in different cache lines, so a match will dirty two cache lines. 
So please use a different layout, for example: struct nf_acct { atomic64_t pkts; atomic64_t bytes; struct list_head head; atomic_t refcnt; char name[NFACCT_NAME_MAX]; struct rcu_head rcu_head; }; > + > +static void nfnl_acct_free_rcu(struct rcu_head *rcu_head) > +{ > + struct nf_acct *acct = container_of(rcu_head, struct nf_acct, rcu_head); > + > + kfree(acct); > +} You don't need this wrapper if you use kfree_rcu() > + > +static int > +nfnl_acct_new(struct sock *nfnl, struct sk_buff *skb, > + const struct nlmsghdr *nlh, const struct nlattr * const tb[]) > +{ > + struct nf_acct *nfacct, *matching = NULL; > + char *acct_name; > + > + if (!tb[NFACCT_NAME]) > + return -EINVAL; > + > + acct_name = nla_data(tb[NFACCT_NAME]); > + > + list_for_each_entry(nfacct, &nfnl_acct_list, head) { > + if (strncmp(nfacct->name, acct_name, NFACCT_NAME_MAX) != 0) > + continue; > + > + if (nlh->nlmsg_flags & NLM_F_EXCL) > + return -EEXIST; > + > + matching = nfacct; > + break; > + } > + > + if (matching) { > + if (nlh->nlmsg_flags & NLM_F_REPLACE) { > + /* reset counters if you request a replacement. 
*/ > + atomic64_set(&matching->pkts, 0); > + atomic64_set(&matching->bytes, 0); > + return 0; > + } > + return -EBUSY; extra tabulation before "return -EBUSY" > + } > + > + nfacct = kzalloc(sizeof(struct nf_acct), GFP_KERNEL); > + if (nfacct == NULL) > + return -ENOMEM; > + > + strncpy(nfacct->name, nla_data(tb[NFACCT_NAME]), NFACCT_NAME_MAX); > + > + if (tb[NFACCT_BYTES]) { > + atomic64_set(&nfacct->bytes, > + be64_to_cpu(nla_get_u64(tb[NFACCT_BYTES]))); > + } > + if (tb[NFACCT_PKTS]) { > + atomic64_set(&nfacct->pkts, > + be64_to_cpu(nla_get_u64(tb[NFACCT_PKTS]))); > + } > + atomic_inc(&nfacct->refcnt); could be atomic_set() > + list_add_tail_rcu(&nfacct->head, &nfnl_acct_list); > + return 0; > +} > + > +static int > +nfnl_acct_fill_info(struct sk_buff *skb, u32 pid, u32 seq, u32 type, > + int event, struct nf_acct *acct) > +{ > + struct nlmsghdr *nlh; > + struct nfgenmsg *nfmsg; > + unsigned int flags = pid ? NLM_F_MULTI : 0; > + u64 pkts, bytes; > + > + event |= NFNL_SUBSYS_ACCT << 8; > + nlh = nlmsg_put(skb, pid, seq, event, sizeof(*nfmsg), flags); > + if (nlh == NULL) > + goto nlmsg_failure; > + > + nfmsg = nlmsg_data(nlh); > + nfmsg->nfgen_family = AF_UNSPEC; > + nfmsg->version = NFNETLINK_V0; > + nfmsg->res_id = 0; > + > + NLA_PUT_STRING(skb, NFACCT_NAME, acct->name); > + > + if (type == NFNL_MSG_ACCT_GET_CTRZERO) { > + pkts = atomic64_xchg(&acct->pkts, 0); > + bytes = atomic64_xchg(&acct->bytes, 0); > + } else { > + pkts = atomic64_read(&acct->pkts); > + bytes = atomic64_read(&acct->bytes); > + } > + NLA_PUT_BE64(skb, NFACCT_PKTS, cpu_to_be64(pkts)); > + NLA_PUT_BE64(skb, NFACCT_BYTES, cpu_to_be64(bytes)); > + NLA_PUT_BE32(skb, NFACCT_USE, htonl(atomic_read(&acct->refcnt))); > + > + nlmsg_end(skb, nlh); > + return skb->len; > + > +nlmsg_failure: > +nla_put_failure: > + nlmsg_cancel(skb, nlh); > + return -1; > +} > + > +static int > +nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb) > +{ > + struct nf_acct *cur, *last; > + > + if (cb->args[2]) 
> + return 0; > + > + last = (struct nf_acct *)cb->args[1]; > + if (cb->args[1]) > + cb->args[1] = 0; > + > + rcu_read_lock(); > + list_for_each_entry_rcu(cur, &nfnl_acct_list, head) { > + if (last && cur != last) > + continue; > + > + if (nfnl_acct_fill_info(skb, NETLINK_CB(cb->skb).pid, > + cb->nlh->nlmsg_seq, > + NFNL_MSG_TYPE(cb->nlh->nlmsg_type), > + NFNL_MSG_ACCT_NEW, cur) < 0) { > + cb->args[1] = (unsigned long)cur; > + break; > + } > + } > + if (!cb->args[1]) > + cb->args[2] = 1; > + rcu_read_unlock(); > + return skb->len; > +} > + > +static int > +nfnl_acct_get(struct sock *nfnl, struct sk_buff *skb, > + const struct nlmsghdr *nlh, const struct nlattr * const tb[]) > +{ > + int ret = 0; > + struct nf_acct *cur; > + char *acct_name; > + > + if (nlh->nlmsg_flags & NLM_F_DUMP) { > + return netlink_dump_start(nfnl, skb, nlh, nfnl_acct_dump, > + NULL, 0); > + } > + > + if (!tb[NFACCT_NAME]) > + return -EINVAL; > + acct_name = nla_data(tb[NFACCT_NAME]); > + > + list_for_each_entry(cur, &nfnl_acct_list, head) { > + struct sk_buff *skb2; > + > + if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX)!= 0) > + continue; > + > + skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); > + if (skb2 == NULL) > + break; > + > + ret = nfnl_acct_fill_info(skb2, NETLINK_CB(skb).pid, > + nlh->nlmsg_seq, > + NFNL_MSG_TYPE(nlh->nlmsg_type), > + NFNL_MSG_ACCT_NEW, cur); > + if (ret <= 0) > + kfree_skb(skb2); > + > + break; > + } > + return ret; > +} > + > +/* try to delete object, fail if it is still in use. */ > +static int nfnl_acct_try_del(struct nf_acct *cur) > +{ > + int ret = 0; > + > + /* we want to avoid races with nfnl_acct_find_get. */ > + if (atomic_dec_and_test(&cur->refcnt)) { > + /* We are protected by nfnl mutex. */ > + list_del_rcu(&cur->head); > + call_rcu(&cur->rcu_head, nfnl_acct_free_rcu); kfree_rcu(cur, rcu_head) > + } else { > + /* still in use, restore reference counter. 
*/ > + atomic_inc(&cur->refcnt); > + ret = -EBUSY; > + } > + return ret; > +} > + > +static int > +nfnl_acct_del(struct sock *nfnl, struct sk_buff *skb, > + const struct nlmsghdr *nlh, const struct nlattr * const tb[]) > +{ > + char *acct_name; > + struct nf_acct *cur; > + int ret = -ENOENT; > + > + if (!tb[NFACCT_NAME]) { > + list_for_each_entry(cur, &nfnl_acct_list, head) > + nfnl_acct_try_del(cur); > + > + return 0; > + } > + acct_name = nla_data(tb[NFACCT_NAME]); > + > + list_for_each_entry(cur, &nfnl_acct_list, head) { > + if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX) != 0) > + continue; > + > + ret = nfnl_acct_try_del(cur); > + if (ret < 0) > + return ret; > + > + break; > + } > + return ret; > +} > + > +static const struct nla_policy nfnl_acct_policy[NFACCT_MAX+1] = { > + [NFACCT_NAME] = { .type = NLA_NUL_STRING, .len = NFACCT_NAME_MAX-1 }, > + [NFACCT_BYTES] = { .type = NLA_U64 }, > + [NFACCT_PKTS] = { .type = NLA_U64 }, > +}; > + > +static const struct nfnl_callback nfnl_acct_cb[NFNL_MSG_ACCT_MAX] = { > + [NFNL_MSG_ACCT_NEW] = { .call = nfnl_acct_new, > + .attr_count = NFACCT_MAX, > + .policy = nfnl_acct_policy }, > + [NFNL_MSG_ACCT_GET] = { .call = nfnl_acct_get, > + .attr_count = NFACCT_MAX, > + .policy = nfnl_acct_policy }, > + [NFNL_MSG_ACCT_GET_CTRZERO] = { .call = nfnl_acct_get, > + .attr_count = NFACCT_MAX, > + .policy = nfnl_acct_policy }, > + [NFNL_MSG_ACCT_DEL] = { .call = nfnl_acct_del, > + .attr_count = NFACCT_MAX, > + .policy = nfnl_acct_policy }, > +}; > + > +static const struct nfnetlink_subsystem nfnl_acct_subsys = { > + .name = "acct", > + .subsys_id = NFNL_SUBSYS_ACCT, > + .cb_count = NFNL_MSG_ACCT_MAX, > + .cb = nfnl_acct_cb, > +}; > + > +MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_ACCT); > + > +struct nf_acct *nfnl_acct_find_get(const char *acct_name) > +{ > + struct nf_acct *cur, *acct = NULL; > + > + rcu_read_lock(); > + list_for_each_entry_rcu(cur, &nfnl_acct_list, head) { > + if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX)!= 0) > + 
continue; > + > + if (!try_module_get(THIS_MODULE)) > + goto err; > + > + if (!atomic_inc_not_zero(&cur->refcnt)) { > + module_put(THIS_MODULE); > + goto err; > + } > + > + acct = cur; > + break; > + } > + rcu_read_unlock(); > +err: > + return acct; > +} > +EXPORT_SYMBOL_GPL(nfnl_acct_find_get); > + > +void nfnl_acct_put(struct nf_acct *acct) > +{ > + atomic_dec(&acct->refcnt); > + module_put(THIS_MODULE); > +} > +EXPORT_SYMBOL_GPL(nfnl_acct_put); > + > +void nfnl_acct_update(const struct sk_buff *skb, struct nf_acct *nfacct) > +{ > + atomic64_inc(&nfacct->pkts); > + atomic64_add(skb->len, &nfacct->bytes); > +} > +EXPORT_SYMBOL_GPL(nfnl_acct_update); > + > +static int __init nfnl_acct_init(void) > +{ > + int ret; > + > + pr_info("nfnl_acct: registering with nfnetlink.\n"); > + ret = nfnetlink_subsys_register(&nfnl_acct_subsys); > + if (ret < 0) { > + pr_err("nfnl_acct_init: cannot register with nfnetlink.\n"); > + goto err_out; > + } > + return 0; > +err_out: > + return ret; > +} > + > +static void __exit nfnl_acct_exit(void) > +{ > + struct nf_acct *cur, *tmp; > + > + pr_info("nfnl_acct: unregistering from nfnetlink.\n"); > + nfnetlink_subsys_unregister(&nfnl_acct_subsys); > + > + list_for_each_entry_safe(cur, tmp, &nfnl_acct_list, head) { > + list_del_rcu(&cur->head); > + /* We are sure that our objects have no clients at this point, > + * it's safe to release them all without checking refcnt. */ > + kfree(cur); Ok for the refcnt, but not sure why rcu grace period is not needed ? Anyway, you can use kfree_rcu() here and we all can sleep well. > + } > +} > + > +module_init(nfnl_acct_init); > +module_exit(nfnl_acct_exit); -- To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html