Sorry, wrong description + patch. Let's start again. [PATCH] rework NFCT to use a generic hashtable This patch introduces a generic hashtable to store the nf_conntrack objects. The objects are identified by the original and reply tuples instead of the conntrack ID which is not dumped in the event message of linux kernel < 2.6.25. This patch also fixes the NFCT_MSG_* by NFCT_T_* which is the appropriate message type tag. Signed-off-by: Pablo Neira Ayuso <pablo@xxxxxxxxxxxxx> -- "Los honestos son inadaptados sociales" -- Les Luthiers
[PATCH] rework NFCT to use a generic hashtable This patch introduces a generic hashtable to store the nf_conntrack objects. The objects are identified by the original and reply tuples instead of the conntrack ID which is not dumped in the event message of linux kernel < 2.6.25. This patch also fixes the NFCT_MSG_* by NFCT_T_* which is the appropriate message type tag. Signed-off-by: Pablo Neira Ayuso <pablo@xxxxxxxxxxxxx> Index: ulogd2/input/flow/ulogd_inpflow_NFCT.c =================================================================== --- ulogd2.orig/input/flow/ulogd_inpflow_NFCT.c 2008-05-14 13:46:34.000000000 +0200 +++ ulogd2/input/flow/ulogd_inpflow_NFCT.c 2008-05-14 13:48:10.000000000 +0200 @@ -13,12 +13,13 @@ * Added timestamp accounting support of the conntrack entries, * reworked by Harald Welte. * + * 11 May 2008, Pablo Neira Ayuso <pablo@xxxxxxxxxxxxx> + * Use a generic hashtable to store the existing flows + * * TODO: * - add nanosecond-accurate packet receive timestamp of event-changing * packets to {ip,nf}_conntrack_netlink, so we can have accurate IPFIX * flowStart / flowEnd NanoSeconds. - * - if using preallocated data structure, get rid of all list heads and - * use per-bucket arrays instead. * - SIGHUP for reconfiguration without loosing hash table contents, but * re-read of config and reallocation / rehashing of table, if required * - Split hashtable code into separate [filter] plugin, so we can run @@ -34,6 +35,8 @@ #include <sys/time.h> #include <time.h> #include <ulogd/linuxlist.h> +#include <ulogd/jhash.h> +#include <ulogd/hash.h> #include <ulogd/ulogd.h> #include <ulogd/timer.h> @@ -44,24 +47,15 @@ typedef enum TIMES_ { START, STOP, __TIME_MAX } TIMES; struct ct_timestamp { - struct llist_head list; struct timeval time[__TIME_MAX]; - int id; -}; - -struct ct_htable { - struct llist_head *buckets; - int num_buckets; - int prealloc; - struct llist_head idle; - struct ct_timestamp *ts; + struct nf_conntrack *ct; }; struct nfct_pluginstance { struct nfct_handle *cth; struct ulogd_fd nfct_fd; struct ulogd_timer timer; - struct ct_htable *ct_active; + struct hashtable *ct_active; }; #define HTABLE_SIZE (8192) @@ -69,7 +63,7 @@ struct nfct_pluginstance { #define EVENT_MASK NF_NETLINK_CONNTRACK_NEW | NF_NETLINK_CONNTRACK_DESTROY static struct config_keyset nfct_kset = { - .num_ces = 6, + .num_ces = 5, .ces = { { .key = "pollinterval", @@ -84,12 +78,6 @@ static struct config_keyset nfct_kset = .u.value = 1, }, { - .key = "hash_prealloc", - .type = CONFIG_TYPE_INT, - .options = CONFIG_OPT_NONE, - .u.value = 1, - }, - { .key = "hash_buckets", .type = CONFIG_TYPE_INT, .options = CONFIG_OPT_NONE, @@ -112,10 +100,9 @@ static struct config_keyset nfct_kset = }; #define pollint_ce(x) (x->ces[0]) #define usehash_ce(x) (x->ces[1]) -#define prealloc_ce(x) (x->ces[2]) -#define buckets_ce(x) (x->ces[3]) -#define maxentries_ce(x) (x->ces[4]) -#define eventmask_ce(x) (x->ces[5]) +#define buckets_ce(x) (x->ces[2]) +#define maxentries_ce(x) (x->ces[3]) +#define eventmask_ce(x) (x->ces[4]) enum nfct_keys { NFCT_ORIG_IP_SADDR = 0, @@ -366,117 +353,68 @@ static struct ulogd_key nfct_okeys[] = { }, }; -static struct ct_htable *htable_alloc(int htable_size, int prealloc) +static uint32_t __hash4(const struct nf_conntrack *ct, struct hashtable *table) { - struct ct_htable *htable; - struct ct_timestamp *ct; - int i; - - htable = malloc(sizeof(*htable) - + sizeof(struct llist_head)*htable_size); - if (!htable) - return NULL; - - htable->buckets = (void *)htable + sizeof(*htable); - htable->num_buckets = htable_size; - htable->prealloc = prealloc; - INIT_LLIST_HEAD(&htable->idle); - - for (i = 0; i < htable->num_buckets; i++) - INIT_LLIST_HEAD(&htable->buckets[i]); - - if (!htable->prealloc) - return htable; - - ct = malloc(sizeof(struct ct_timestamp) - * htable->num_buckets * htable->prealloc); - if (!ct) { - free(htable); - return NULL; - } + unsigned int a, b; - /* save the pointer for later free()ing */ - htable->ts = ct; + a = jhash(nfct_get_attr(ct, ATTR_ORIG_IPV4_SRC), sizeof(uint32_t), + ((nfct_get_attr_u8(ct, ATTR_ORIG_L3PROTO) << 16) | + (nfct_get_attr_u8(ct, ATTR_ORIG_L4PROTO)))); - for (i = 0; i < htable->num_buckets * htable->prealloc; i++) - llist_add(&ct[i].list, &htable->idle); + b = jhash(nfct_get_attr(ct, ATTR_ORIG_IPV4_DST), sizeof(uint32_t), + ((nfct_get_attr_u16(ct, ATTR_ORIG_PORT_SRC) << 16) | + (nfct_get_attr_u16(ct, ATTR_ORIG_PORT_DST)))); - return htable; + /* + * Instead of returning hash % table->hashsize (implying a divide) + * we return the high 32 bits of the (hash * table->hashsize) that will + * give results between [0 and hashsize-1] and same hash distribution, + * but using a multiply, less expensive than a divide. See: + * http://www.mail-archive.com/netdev@xxxxxxxxxxxxxxx/msg56623.html + */ + return ((uint64_t)jhash_2words(a, b, 0) * table->hashsize) >> 32; } -static void htable_free(struct ct_htable *htable) +static uint32_t __hash6(const struct nf_conntrack *ct, struct hashtable *table) { - struct llist_head *ptr, *ptr2; - int i; + unsigned int a, b; - if (htable->prealloc) { - /* the easy case */ - free(htable->ts); - free(htable); + a = jhash(nfct_get_attr(ct, ATTR_ORIG_IPV6_SRC), sizeof(uint32_t)*4, + ((nfct_get_attr_u8(ct, ATTR_ORIG_L3PROTO) << 16) | + (nfct_get_attr_u8(ct, ATTR_ORIG_L4PROTO)))); - return; - } - - /* non-prealloc case */ - - for (i = 0; i < htable->num_buckets; i++) { - llist_for_each_safe(ptr, ptr2, &htable->buckets[i]) - free(container_of(ptr, struct ct_timestamp, list)); - } + b = jhash(nfct_get_attr(ct, ATTR_ORIG_IPV6_DST), sizeof(uint32_t)*4, + ((nfct_get_attr_u16(ct, ATTR_ORIG_PORT_SRC) << 16) | + (nfct_get_attr_u16(ct, ATTR_ORIG_PORT_DST)))); - /* don't need to check for 'idle' list, since it is only used in - * the preallocated case */ + return ((uint64_t)jhash_2words(a, b, 0) * table->hashsize) >> 32; } -static int ct_hash_add(struct ct_htable *htable, unsigned int id) +static uint32_t hash(const void *data, struct hashtable *table) { - struct ct_timestamp *ct; - - if (htable->prealloc) { - if (llist_empty(&htable->idle)) { - ulogd_log(ULOGD_ERROR, "Not enough ct_timestamp entries\n"); - return -1; - } - - ct = container_of(htable->idle.next, struct ct_timestamp, list); - - ct->id = id; - gettimeofday(&ct->time[START], NULL); - - llist_move(&ct->list, &htable->buckets[id % htable->num_buckets]); - } else { - ct = malloc(sizeof *ct); - if (!ct) { - ulogd_log(ULOGD_ERROR, "Not enough memory\n"); - return -1; - } - - ct->id = id; - gettimeofday(&ct->time[START], NULL); + int ret = 0; + const struct ct_timestamp *ts = data; - llist_add(&ct->list, &htable->buckets[id % htable->num_buckets]); + switch(nfct_get_attr_u8(ts->ct, ATTR_L3PROTO)) { + case AF_INET: + ret = __hash4(ts->ct, table); + break; + case AF_INET6: + ret = __hash6(ts->ct, table); + break; + default: + break; } - return 0; + return ret; } -static struct ct_timestamp *ct_hash_get(struct ct_htable *htable, uint32_t id) +static int compare(const void *data1, const void *data2) { - struct ct_timestamp *ct = NULL; - struct llist_head *ptr; + const struct ct_timestamp *u1 = data1; + const struct ct_timestamp *u2 = data2; - llist_for_each(ptr, &htable->buckets[id % htable->num_buckets]) { - ct = container_of(ptr, struct ct_timestamp, list); - if (ct->id == id) { - gettimeofday(&ct->time[STOP], NULL); - if (htable->prealloc) - llist_move(&ct->list, &htable->idle); - else - free(ct); - break; - } - } - return ct; + return nfct_cmp(u1->ct, u2->ct, NFCT_CMP_ORIG | NFCT_CMP_REPL); } static int propagate_ct(struct ulogd_pluginstance *upi, @@ -600,28 +538,57 @@ static int event_handler(enum nf_conntra struct nfct_pluginstance *cpi = (struct nfct_pluginstance *) upi->private; struct ct_timestamp *ts = NULL; + struct ct_timestamp tmp = { + .ct = ct, + }; struct ulogd_pluginstance *npi = NULL; int ret = 0; - if (type == NFCT_MSG_NEW) { - if (usehash_ce(upi->config_kset).u.value != 0) { - ct_hash_add(cpi->ct_active, nfct_get_attr_u32(ct, ATTR_ID)); - return 0; + if (usehash_ce(upi->config_kset).u.value == 0) + return NFCT_CB_CONTINUE; + + switch(type) { + case NFCT_T_NEW: + ts = hashtable_add(cpi->ct_active, &tmp); + gettimeofday(&ts->time[START], NULL); + return NFCT_CB_STOLEN; + case NFCT_T_UPDATE: + ts = hashtable_get(cpi->ct_active, &tmp); + if (ts) + nfct_copy(ts->ct, ct, NFCT_CP_META); + else { + ts = hashtable_add(cpi->ct_active, &tmp); + gettimeofday(&ts->time[START], NULL); + return NFCT_CB_STOLEN; } - } else if (type == NFCT_MSG_DESTROY) { - if (usehash_ce(upi->config_kset).u.value != 0) - ts = ct_hash_get(cpi->ct_active, nfct_get_attr_u32(ct, ATTR_ID)); - } + break; + case NFCT_T_DESTROY: + ts = hashtable_get(cpi->ct_active, &tmp); + if (ts) + gettimeofday(&ts->time[STOP], NULL); + + /* since we support the re-use of one instance in + * several different stacks, we duplicate the message + * to let them know */ + llist_for_each_entry(npi, &upi->plist, plist) { + ret = propagate_ct(npi, ct, type, ts); + if (ret != 0) + break; + } + + propagate_ct(upi, ct, type, ts); - /* since we support the re-use of one instance in - * several different stacks, we duplicate the message - * to let them know */ - llist_for_each_entry(npi, &upi->plist, plist) { - ret = propagate_ct(npi, ct, type, ts); - if (ret != 0) - return ret; + if (ts) { + hashtable_del(cpi->ct_active, ts); + free(ts->ct); + } + break; + default: + ulogd_log(ULOGD_NOTICE, "unknown netlink message type\n"); + break; } - return propagate_ct(upi, ct, type, ts); + + return NFCT_CB_CONTINUE; } static int read_cb_nfct(int fd, unsigned int what, void *param) @@ -677,7 +644,6 @@ static int constructor_nfct(struct ulogd { struct nfct_pluginstance *cpi = (struct nfct_pluginstance *)upi->private; - int prealloc; cpi->cth = nfct_open(NFNL_SUBSYS_CTNETLINK, eventmask_ce(upi->config_kset).u.value); @@ -695,15 +661,13 @@ static int constructor_nfct(struct ulogd ulogd_register_fd(&cpi->nfct_fd); - if (prealloc_ce(upi->config_kset).u.value != 0) - prealloc = maxentries_ce(upi->config_kset).u.value / - buckets_ce(upi->config_kset).u.value; - else - prealloc = 0; - if (usehash_ce(upi->config_kset).u.value != 0) { - cpi->ct_active = htable_alloc(buckets_ce(upi->config_kset).u.value, - prealloc); + cpi->ct_active = + hashtable_create(buckets_ce(upi->config_kset).u.value, + maxentries_ce(upi->config_kset).u.value, + sizeof(struct ct_timestamp), + hash, + compare); if (!cpi->ct_active) { ulogd_log(ULOGD_FATAL, "error allocating hash\n"); nfct_close(cpi->cth); @@ -719,7 +683,7 @@ static int destructor_nfct(struct ulogd_ struct nfct_pluginstance *cpi = (void *) pi; int rc; - htable_free(cpi->ct_active); + hashtable_destroy(cpi->ct_active); rc = nfct_close(cpi->cth); if (rc < 0) Index: ulogd2/src/hash.c =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ ulogd2/src/hash.c 2008-05-14 13:46:52.000000000 +0200 @@ -0,0 +1,193 @@ +/* + * (C) 2006-2007 by Pablo Neira Ayuso <pablo@xxxxxxxxxxxxx> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Description: generic hash table implementation + */ + +#include <ulogd/hash.h> +#include <ulogd/slist.h> + +#include <errno.h> +#include <stdlib.h> +#include <string.h> + +struct hashtable_node *hashtable_alloc_node(int datasize, void *data) +{ + struct hashtable_node *n; + int size = sizeof(struct hashtable_node) + datasize; + + n = malloc(size); + if (!n) + return NULL; + memset(n, 0, size); + memcpy(n->data, data, datasize); + + return n; +} + +void hashtable_destroy_node(struct hashtable_node *h) +{ + free(h); +} + +struct hashtable * +hashtable_create(int hashsize, int limit, int datasize, + uint32_t (*hash)(const void *data, struct hashtable *table), + int (*compare)(const void *data1, const void *data2)) +{ + int i; + struct hashtable *h; + int size = sizeof(struct hashtable) + + hashsize * sizeof(struct slist_head); + + h = (struct hashtable *) malloc(size); + if (!h) { + errno = ENOMEM; + return NULL; + } + + memset(h, 0, size); + for (i=0; i<hashsize; i++) + INIT_SLIST_HEAD(h->members[i]); + + h->hashsize = hashsize; + h->limit = limit; + h->datasize = datasize; + h->hash = hash; + h->compare = compare; + + return h; +} + +void hashtable_destroy(struct hashtable *h) +{ + hashtable_flush(h); + free(h); +} + +void *hashtable_add(struct hashtable *table, void *data) +{ + uint32_t id; + struct slist_head *e; + struct hashtable_node *n; + + /* hash table is full */ + if (table->count >= table->limit) { + errno = ENOSPC; + return 0; + } + + id = table->hash(data, table); + + slist_for_each(e, &table->members[id]) { + n = slist_entry(e, struct hashtable_node, head); + if (table->compare(n->data, data)) { + errno = EEXIST; + return 0; + } + } + + n = hashtable_alloc_node(table->datasize, data); + if (n == NULL) { + errno = ENOMEM; + return NULL; + } + + slist_add(&table->members[id], &n->head); + table->count++; + + return n->data; +} + +void *hashtable_get(struct hashtable *table, const void *data) +{ + struct slist_head *e; + uint32_t id; + struct hashtable_node *n; + + id = table->hash(data, table); + + slist_for_each(e, &table->members[id]) { + n = slist_entry(e, struct hashtable_node, head); + if (table->compare(n->data, data)) + return n->data; + } + + errno = ENOENT; + return NULL; +} + +int hashtable_del(struct hashtable *table, void *data) +{ + struct slist_head *e, *next, *prev; + uint32_t id; + struct hashtable_node *n; + + id = table->hash(data, table); + + slist_for_each_safe(e, prev, next, &table->members[id]) { + n = slist_entry(e, struct hashtable_node, head); + if (table->compare(n->data, data)) { + slist_del(e, prev); + hashtable_destroy_node(n); + table->count--; + return 0; + } + } + errno = ENOENT; + return -1; +} + +int hashtable_flush(struct hashtable *table) +{ + uint32_t i; + struct slist_head *e, *next, *prev; + struct hashtable_node *n; + + for (i=0; i < table->hashsize; i++) + slist_for_each_safe(e, prev, next, &table->members[i]) { + n = slist_entry(e, struct hashtable_node, head); + slist_del(e, prev); + hashtable_destroy_node(n); + } + + table->count = 0; + + return 0; +} + +int hashtable_iterate(struct hashtable *table, void *data, + int (*iterate)(void *data1, void *data2)) +{ + uint32_t i; + struct slist_head *e, *next, *prev; + struct hashtable_node *n; + + for (i=0; i < table->hashsize; i++) { + slist_for_each_safe(e, prev, next, &table->members[i]) { + n = slist_entry(e, struct hashtable_node, head); + if (iterate(data, n->data) == -1) + return -1; + } + } + return 0; +} + +unsigned int hashtable_counter(const struct hashtable *table) +{ + return table->count; +} Index: ulogd2/include/ulogd/hash.h =================================================================== --- /dev/null 1970-01-01 00:00:00.000000000 +0000 +++ ulogd2/include/ulogd/hash.h 2008-05-14 13:46:52.000000000 +0200 @@ -0,0 +1,48 @@ +#ifndef _NF_SET_HASH_H_ +#define _NF_SET_HASH_H_ + +#include <unistd.h> +#include "slist.h" +#include <ulogd/linuxlist.h> + +#include <stdint.h> + +struct hashtable; +struct hashtable_node; + +struct hashtable { + uint32_t hashsize; + uint32_t limit; + uint32_t count; + uint32_t initval; + uint32_t datasize; + + uint32_t (*hash)(const void *data, struct hashtable *table); + int (*compare)(const void *data1, const void *data2); + + struct slist_head members[0]; +}; + +struct hashtable_node { + struct slist_head head; + char data[0]; +}; + +struct hashtable_node *hashtable_alloc_node(int datasize, void *data); +void hashtable_destroy_node(struct hashtable_node *h); + +struct hashtable * +hashtable_create(int hashsize, int limit, int datasize, + uint32_t (*hash)(const void *data, struct hashtable *table), + int (*compare)(const void *data1, const void *data2)); +void hashtable_destroy(struct hashtable *h); + +void *hashtable_add(struct hashtable *table, void *data); +void *hashtable_get(struct hashtable *table, const void *data); +int hashtable_del(struct hashtable *table, void *data); +int hashtable_flush(struct hashtable *table); +int hashtable_iterate(struct hashtable *table, void *data, + int (*iterate)(void *data1, void *data2)); +unsigned int hashtable_counter(const struct hashtable *table); + +#endif