Re: [PATCH 1/2] netfilter: add extended accounting infrastructure over nfnetlink

Patrick McHardy <kaber@xxxxxxxxx> · Wed, 14 Dec 2011 12:23:21 +0100

On 12/14/2011 12:00 PM, pablo@xxxxxxxxxxxxx wrote:

In order to manipulate create accounting objects, you require the
new libnetfilter_acct library. It contains several examples of use:

libnetfilter_acct/examples# ./nfacct-add http-traffic
libnetfilter_acct/examples# ./nfacct-get
http-traffic = { pkts = 000000000000,   bytes = 000000000000 };

Then, you can use one of this accounting objects in several iptables
rules using the new NFACCT target (which comes in a follow-up patch):

  # iptables -I INPUT -p tcp --sport 80 -j NFACCT --nfacct-name http-traffic
  # iptables -I OUTPUT -p tcp --dport 80 -j NFACCT --nfacct-name http-traffic

The idea is simple: if one packet matches the rule, the NFACCT target
updates the counters.

I like the idea.

diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c
new file mode 100644
index 0000000..3ec407f
--- /dev/null
+++ b/net/netfilter/nfnetlink_acct.c
@@ -0,0 +1,352 @@
+/*
+ * (C) 2011 Pablo Neira Ayuso<pablo@xxxxxxxxxxxxx>
+ * (C) 2011 Intra2net AG<http://www.intra2net.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation (or any later at your option).
+ */
+#include<linux/init.h>
+#include<linux/module.h>
+#include<linux/kernel.h>
+#include<linux/skbuff.h>
+#include<linux/netlink.h>
+#include<linux/rculist.h>
+#include<linux/slab.h>
+#include<linux/types.h>
+#include<linux/errno.h>
+#include<net/netlink.h>
+#include<net/sock.h>
+#include<asm/atomic.h>
+
+#include<linux/netfilter.h>
+#include<linux/netfilter/nfnetlink.h>
+#include<linux/netfilter/nfnetlink_acct.h>
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso<pablo@xxxxxxxxxxxxx>");
+MODULE_DESCRIPTION("nfacct: Extended Netfilter accounting infrastructure");
+
+static LIST_HEAD(nfnl_acct_list);
+
+struct nf_acct {
+	struct rcu_head		rcu_head;
+	struct list_head	head;
+	spinlock_t		lock;	/* to update the counters. */
+	atomic_t		refcnt;
+
+	char			name[NFACCT_NAME_MAX];
+	__u64			pkts;
+	__u64			bytes;
+};
+
+static void nfnl_acct_free_rcu(struct rcu_head *rcu_head)
+{
+	struct nf_acct *acct = container_of(rcu_head, struct nf_acct, rcu_head);
+
+	kfree(acct);
+}
+
+static int
+nfnl_acct_new(struct sock *nfnl, struct sk_buff *skb,
+	     const struct nlmsghdr *nlh, const struct nlattr * const tb[])
+{
+	int ret;
+	struct nf_acct *acct, *matching = NULL;
+	char *acct_name;
+
+	if (!tb[NFACCT_NAME])
+		return -EINVAL;
+
+	acct_name = nla_data(tb[NFACCT_NAME]);
+
+	rcu_read_lock();
+	list_for_each_entry(acct,&nfnl_acct_list, head) {

I don't really get the locking concept. All netlink operations happen under
the nfnl mutex, so using RCU for the lookup shouldn't be necessary
(applied to all netlink operations).

+		if (strncmp(acct->name, acct_name, NFACCT_NAME_MAX) != 0)
+			continue;
+
+                if (nlh->nlmsg_flags&  NLM_F_EXCL) {
+			rcu_read_unlock();
+			ret = -EEXIST;
+			goto err;
+		}
+		matching = acct;

break?
+        }
+	rcu_read_unlock();
+
+	acct = kzalloc(sizeof(struct nf_acct), GFP_KERNEL);
+	if (acct == NULL) {
+		ret = -ENOMEM;
+		goto err;
+	}
+	spin_lock_init(&acct->lock);
+	strncpy(acct->name, nla_data(tb[NFACCT_NAME]), NFACCT_NAME_MAX);
+	if (tb[NFACCT_BYTES])
+		acct->bytes = be64_to_cpu(nla_get_u64(tb[NFACCT_BYTES]));
+	if (tb[NFACCT_PKTS])
+		acct->pkts = be64_to_cpu(nla_get_u64(tb[NFACCT_PKTS]));
+
+	atomic_inc(&acct->refcnt);
+
+	/* We are protected by nfnl mutex. */
+	if (matching) {

This seems to be a replace operation, so I think you should
require NLM_F_REPLACE. Also it seems you could just
reinitialize the existing counter instead of unconditionally
allocating a new one.

+		list_del_rcu(&matching->head);
+		if (atomic_dec_and_test(&matching->refcnt))
+			call_rcu(&matching->rcu_head, nfnl_acct_free_rcu);
+
+	}
+	list_add_tail_rcu(&acct->head,&nfnl_acct_list);
+	return 0;
+err:
+	return ret;
+}
+
+static int
+nfnl_acct_fill_info(struct sk_buff *skb, u32 pid, u32 seq,
+		   int event, struct nf_acct *acct)
+{
+	struct nlmsghdr *nlh;
+	struct nfgenmsg *nfmsg;
+	unsigned int flags = pid ? NLM_F_MULTI : 0;
+
+	event |= NFNL_SUBSYS_ACCT<<  8;
+	nlh = nlmsg_put(skb, pid, seq, event, sizeof(*nfmsg), flags);
+	if (nlh == NULL)
+		goto nlmsg_failure;
+
+	nfmsg = nlmsg_data(nlh);
+	nfmsg->nfgen_family = AF_UNSPEC;
+	nfmsg->version = NFNETLINK_V0;
+	nfmsg->res_id = 0;
+
+	NLA_PUT_STRING(skb, NFACCT_NAME, acct->name);
+	NLA_PUT_BE64(skb, NFACCT_PKTS, cpu_to_be64(acct->pkts));
+	NLA_PUT_BE64(skb, NFACCT_BYTES, cpu_to_be64(acct->bytes));
+
+	nlmsg_end(skb, nlh);
+	return skb->len;
+
+nlmsg_failure:
+nla_put_failure:
+	nlmsg_cancel(skb, nlh);
+	return -1;
+}
+
+static int
+nfnl_acct_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+	struct nf_acct *cur, *last;
+
+	if (cb->args[2])
+		return 0;
+
+	last = (struct nf_acct *)cb->args[1];
+	if (cb->args[1])
+		cb->args[1] = 0;
+
+	rcu_read_lock();
+	list_for_each_entry(cur,&nfnl_acct_list, head) {
+		if (last&&  cur != last)
+			continue;
+
+		if (nfnl_acct_fill_info(skb, NETLINK_CB(cb->skb).pid,
+				       cb->nlh->nlmsg_seq,
+				       NFNL_MSG_ACCT_NEW, cur)<  0) {
+			cb->args[1] = (unsigned long)cur;
+			break;
+		}
+
+		if (NFNL_MSG_TYPE(cb->nlh->nlmsg_type) ==
+						NFNL_MSG_ACCT_GET_CTRZERO) {
+			spin_lock_bh(&cur->lock);
+			cur->pkts = 0;
+			cur->bytes = 0;
+			spin_unlock_bh(&cur->lock);
+		}
+	}
+	if (!cb->args[1])
+		cb->args[2] = 1;
+	rcu_read_unlock();
+	return skb->len;
+}
+
+static int
+nfnl_acct_get(struct sock *nfnl, struct sk_buff *skb,
+	     const struct nlmsghdr *nlh, const struct nlattr * const tb[])
+{
+	int ret = 0;
+	struct nf_acct *cur;
+	char *acct_name;
+
+	if (nlh->nlmsg_flags&  NLM_F_DUMP) {
+		return netlink_dump_start(nfnl, skb, nlh, nfnl_acct_dump,
+					  NULL, 0);
+	}
+
+	if (!tb[NFACCT_NAME])
+		return -EINVAL;
+	acct_name = nla_data(tb[NFACCT_NAME]);
+
+	rcu_read_lock();
+	list_for_each_entry(cur,&nfnl_acct_list, head) {
+		struct sk_buff *skb2;
+
+		if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX)!= 0)
+			continue;
+
+		skb2 = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC);
+		if (skb2 == NULL)
+			break;
+
+		ret = nfnl_acct_fill_info(skb2, NETLINK_CB(skb).pid,
+					 nlh->nlmsg_seq,
+					 NFNL_MSG_ACCT_NEW, cur);
+		if (ret<= 0)
+			kfree_skb(skb2);
+
+		if (NFNL_MSG_TYPE(nlh->nlmsg_type) ==
+						NFNL_MSG_ACCT_GET_CTRZERO) {
+			spin_lock_bh(&cur->lock);
+			cur->pkts = 0;
+			cur->bytes = 0;
+			spin_unlock_bh(&cur->lock);
+		}
+		break;
+	}
+	rcu_read_unlock();
+	return ret;
+}
+
+static int
+nfnl_acct_del(struct sock *nfnl, struct sk_buff *skb,
+	     const struct nlmsghdr *nlh, const struct nlattr * const tb[])
+{
+	char *acct_name;
+	struct nf_acct *cur;
+	int ret = -ENOENT;
+
+	if (!tb[NFACCT_NAME]) {
+		rcu_read_lock();
+		list_for_each_entry(cur,&nfnl_acct_list, head) {
+			/* We are protected by nfnl mutex. */
+			list_del_rcu(&cur->head);
+			if (atomic_dec_and_test(&cur->refcnt))
+				call_rcu(&cur->rcu_head, nfnl_acct_free_rcu);

I think its strange to keep the object around after deletion if
it is still in use. In case it is still in use, I'd return -EBUSY.

+		}
+		rcu_read_lock();
+		return 0;
+	}
+	acct_name = nla_data(tb[NFACCT_NAME]);
+
+	rcu_read_lock();
+	list_for_each_entry(cur,&nfnl_acct_list, head) {
+		if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX) != 0)
+			continue;
+
+		/* We are protected by nfnl mutex. */
+		list_del_rcu(&cur->head);
+		if (atomic_dec_and_test(&cur->refcnt))
+			call_rcu(&cur->rcu_head, nfnl_acct_free_rcu);
+		ret = 0;
+		break;
+	}
+	rcu_read_lock();
+	return ret;
+}
+
+static const struct nla_policy nfnl_acct_policy[NFACCT_MAX+1] = {
+	[NFACCT_NAME] = { .type = NLA_NUL_STRING, .len = NFACCT_NAME_MAX-1 },
+	[NFACCT_BYTES] = { .type = NLA_U64 },
+	[NFACCT_PKTS] = { .type = NLA_U64 },
+};
+
+static const struct nfnl_callback nfnl_acct_cb[NFNL_MSG_ACCT_MAX] = {
+	[NFNL_MSG_ACCT_NEW]		= { .call = nfnl_acct_new,
+					    .attr_count = NFACCT_MAX,
+					    .policy = nfnl_acct_policy },
+	[NFNL_MSG_ACCT_GET] 		= { .call = nfnl_acct_get,
+					    .attr_count = NFACCT_MAX,
+					    .policy = nfnl_acct_policy },
+	[NFNL_MSG_ACCT_GET_CTRZERO] 	= { .call = nfnl_acct_get,
+					    .attr_count = NFACCT_MAX,
+					    .policy = nfnl_acct_policy },
+	[NFNL_MSG_ACCT_DEL]		= { .call = nfnl_acct_del,
+					    .attr_count = NFACCT_MAX,
+					    .policy = nfnl_acct_policy },
+};
+
+static const struct nfnetlink_subsystem nfnl_acct_subsys = {
+	.name				= "acct",
+	.subsys_id			= NFNL_SUBSYS_ACCT,
+	.cb_count			= NFNL_MSG_ACCT_MAX,
+	.cb				= nfnl_acct_cb,
+};
+
+MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_ACCT);
+
+struct nf_acct *nfnl_acct_find_get(const char *acct_name)
+{
+	struct nf_acct *cur, *acct = NULL;
+
+	rcu_read_lock();
+	list_for_each_entry(cur,&nfnl_acct_list, head) {
+		if (strncmp(cur->name, acct_name, NFACCT_NAME_MAX)!= 0)
+			continue;
+
+		acct = cur;
+		atomic_inc(&acct->refcnt);

This probably needs atomic_inc_not_zero() since the
lookup might race with deletion.

+		break;
+	}
+	rcu_read_unlock();
+	return acct;
+}
+EXPORT_SYMBOL_GPL(nfnl_acct_find_get);
+
+void nfnl_acct_put(struct nf_acct *acct)
+{
+	if (atomic_dec_and_test(&acct->refcnt))
+		call_rcu(&acct->rcu_head, nfnl_acct_free_rcu);
+}
+EXPORT_SYMBOL_GPL(nfnl_acct_put);
+
+void nfnl_acct_update(const struct sk_buff *skb, struct nf_acct *nfacct)
+{
+	spin_lock_bh(&nfacct->lock);
+	nfacct->pkts++;
+	nfacct->bytes += skb->len;
+	spin_unlock_bh(&nfacct->lock);
+}
+EXPORT_SYMBOL_GPL(nfnl_acct_update);
+
+static int __init nfnl_acct_init(void)
+{
+	int ret;
+
+	pr_info("nfnl_acct: registering with nfnetlink.\n");
+	ret = nfnetlink_subsys_register(&nfnl_acct_subsys);
+	if (ret<  0) {
+		pr_err("nfnl_acct_init: cannot register with nfnetlink.\n");
+		goto err_out;
+	}
+	return 0;
+err_out:
+	return ret;
+}
+
+static void __exit nfnl_acct_exit(void)
+{
+	struct nf_acct *cur, *tmp;
+
+	pr_info("nfnl_acct: unregistering from nfnetlink.\n");
+	nfnetlink_subsys_unregister(&nfnl_acct_subsys);
+
+	/* if we can remove this module, it means that it has no clients. */
+	list_for_each_entry_safe(cur, tmp,&nfnl_acct_list, head) {
+		list_del_rcu(&cur->head);
+		if (atomic_dec_and_test(&cur->refcnt))
+			kfree(cur);

What happens if it is non-zero? The iptables target should
take a module reference as long as its using objects that
this module is responsible for managing.

+	}
+}
+
+module_init(nfnl_acct_init);
+module_exit(nfnl_acct_exit);

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html