On Thu, Apr 22, 2010 at 4:23 AM, Paul E. McKenney <paulmck@xxxxxxxxxxxxxxxxxx> wrote: > On Tue, Apr 20, 2010 at 10:31:36PM +0800, Changli Gao wrote: >> use hash table to speed up entry lookup >> >> A hash table is used to speed up entry lookup when the verdicts aren't received >> in order. The size of hash table can be specified by NFQA_CFG_QUEUE_HTBLSIZ. >> Its default value is 1. Reciprocal division is used to lower the cost of >> division, and the entry IDs are generated carefully to get fair entry >> distribution in the buckets of the hash table. > > A few questions interspersed below. > > Thanx, Paul > >> Signed-off-by: Changli Gao <xiaosuo@xxxxxxxxx> >> ---- >> include/linux/netfilter/nfnetlink_queue.h | 1 >> init/Kconfig | 1 >> lib/Kconfig | 3 >> lib/Makefile | 4 >> lib/reciprocal_div.c | 2 >> net/netfilter/Kconfig | 1 >> net/netfilter/nfnetlink_queue.c | 252 +++++++++++++++++++++++++----- >> 7 files changed, 227 insertions(+), 37 deletions(-) >> diff --git a/include/linux/netfilter/nfnetlink_queue.h b/include/linux/netfilter/nfnetlink_queue.h >> index 2455fe5..77b1566 100644 >> --- a/include/linux/netfilter/nfnetlink_queue.h >> +++ b/include/linux/netfilter/nfnetlink_queue.h >> @@ -83,6 +83,7 @@ enum nfqnl_attr_config { >> NFQA_CFG_CMD, /* nfqnl_msg_config_cmd */ >> NFQA_CFG_PARAMS, /* nfqnl_msg_config_params */ >> NFQA_CFG_QUEUE_MAXLEN, /* __u32 */ >> + NFQA_CFG_QUEUE_HTBLSIZ, /* __u32 */ >> __NFQA_CFG_MAX >> }; >> #define NFQA_CFG_MAX (__NFQA_CFG_MAX-1) >> diff --git a/init/Kconfig b/init/Kconfig >> index cb6069e..4b4266f 100644 >> --- a/init/Kconfig >> +++ b/init/Kconfig >> @@ -1059,6 +1059,7 @@ choice >> >> config SLAB >> bool "SLAB" >> + select RECIPROCAL_DIV >> help >> The regular slab allocator that is established and known to work >> well in all environments. 
It organizes cache hot objects in >> diff --git a/lib/Kconfig b/lib/Kconfig >> index af12831..0c4b5ec 100644 >> --- a/lib/Kconfig >> +++ b/lib/Kconfig >> @@ -231,4 +231,7 @@ config IOQ >> >> If unsure, say N >> >> +config RECIPROCAL_DIV >> + bool >> + >> endmenu >> diff --git a/lib/Makefile b/lib/Makefile >> index 0a6ab6f..c3555bd 100644 >> --- a/lib/Makefile >> +++ b/lib/Makefile >> @@ -10,7 +10,7 @@ endif >> lib-y := ctype.o string.o vsprintf.o cmdline.o \ >> rbtree.o radix-tree.o dump_stack.o \ >> idr.o int_sqrt.o extable.o prio_tree.o \ >> - sha1.o irq_regs.o reciprocal_div.o argv_split.o \ >> + sha1.o irq_regs.o argv_split.o \ >> proportions.o prio_heap.o ratelimit.o show_mem.o \ >> is_single_threaded.o plist.o decompress.o flex_array.o >> >> @@ -103,6 +103,8 @@ obj-$(CONFIG_GENERIC_CSUM) += checksum.o >> >> obj-$(CONFIG_GENERIC_ATOMIC64) += atomic64.o >> >> +obj-$(CONFIG_RECIPROCAL_DIV) += reciprocal_div.o >> + >> hostprogs-y := gen_crc32table >> clean-files := crc32table.h >> >> diff --git a/lib/reciprocal_div.c b/lib/reciprocal_div.c >> index 6a3bd48..39f2e5e 100644 >> --- a/lib/reciprocal_div.c >> +++ b/lib/reciprocal_div.c >> @@ -1,5 +1,6 @@ >> #include <asm/div64.h> >> #include <linux/reciprocal_div.h> >> +#include <linux/module.h> >> >> u32 reciprocal_value(u32 k) >> { >> @@ -7,3 +8,4 @@ u32 reciprocal_value(u32 k) >> do_div(val, k); >> return (u32)val; >> } >> +EXPORT_SYMBOL(reciprocal_value); >> diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig >> index 18d77b5..40b34d5 100644 >> --- a/net/netfilter/Kconfig >> +++ b/net/netfilter/Kconfig >> @@ -8,6 +8,7 @@ config NETFILTER_NETLINK_QUEUE >> tristate "Netfilter NFQUEUE over NFNETLINK interface" >> depends on NETFILTER_ADVANCED >> select NETFILTER_NETLINK >> + select RECIPROCAL_DIV >> help >> If this option is enabled, the kernel will include support >> for queueing packets via NFNETLINK. 
>> diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c >> index e70a6ef..d3d02b7 100644 >> --- a/net/netfilter/nfnetlink_queue.c >> +++ b/net/netfilter/nfnetlink_queue.c >> @@ -28,6 +28,8 @@ >> #include <linux/netfilter/nfnetlink.h> >> #include <linux/netfilter/nfnetlink_queue.h> >> #include <linux/list.h> >> +#include <linux/vmalloc.h> >> +#include <linux/reciprocal_div.h> >> #include <net/sock.h> >> #include <net/netfilter/nf_queue.h> >> >> @@ -37,11 +39,13 @@ >> #include "../bridge/br_private.h" >> #endif >> >> -#define NFQNL_QMAX_DEFAULT 1024 >> +#define NFQNL_QMAX_DEFAULT 1024 >> +#define NFQNL_QHTBLSIZ_DEFAULT 1 >> >> struct nfqnl_instance { >> struct hlist_node hlist; /* global list of queues */ >> struct rcu_head rcu; >> + struct work_struct work; >> >> int peer_pid; >> unsigned int queue_maxlen; >> @@ -49,15 +53,21 @@ struct nfqnl_instance { >> unsigned int queue_total; >> unsigned int queue_dropped; >> unsigned int queue_user_dropped; >> + unsigned int queue_htblsiz; >> + u32 reciprocal_value; >> >> unsigned int id_sequence; /* 'sequence' of pkt ids */ >> + unsigned int id_increment; >> + unsigned int id_offset; >> + unsigned int id_limit; >> >> u_int16_t queue_num; /* number of this queue */ >> u_int8_t copy_mode; >> >> spinlock_t lock; >> >> - struct list_head queue_list; /* packets in queue */ >> + struct list_head *queue_htbl; /* packets in queue */ >> + bool vmalloc; >> }; >> >> typedef int (*nfqnl_cmpfn)(struct nf_queue_entry *, unsigned long); >> @@ -87,49 +97,87 @@ instance_lookup(u_int16_t queue_num) >> return NULL; >> } >> >> +static void instance_destroy_work(struct work_struct *work) >> +{ >> + struct nfqnl_instance *inst; >> + >> + inst = container_of(work, struct nfqnl_instance, work); >> + if (inst->vmalloc) >> + vfree(inst->queue_htbl); >> + else >> + kfree(inst->queue_htbl); >> + kfree(inst); >> + module_put(THIS_MODULE); >> +} >> + >> static struct nfqnl_instance * >> instance_create(u_int16_t queue_num, int 
pid) >> { >> struct nfqnl_instance *inst; >> - unsigned int h; >> + unsigned int h, i; >> int err; >> >> - spin_lock(&instances_lock); >> - if (instance_lookup(queue_num)) { >> - err = -EEXIST; >> - goto out_unlock; >> - } >> - >> + rcu_read_unlock(); > > This seems strange -- are all the callers aware that instance_create() > temporarily exits the RCU read-side critical section? There is only one caller, so it is aware of that. But it seems that I should add the suffix "_rcu_read_locked()" to instance_create(). > >> inst = kzalloc(sizeof(*inst), GFP_ATOMIC); >> if (!inst) { >> err = -ENOMEM; >> - goto out_unlock; >> + goto out_lock; >> } >> >> + INIT_WORK(&inst->work, instance_destroy_work); >> inst->queue_num = queue_num; >> inst->peer_pid = pid; >> inst->queue_maxlen = NFQNL_QMAX_DEFAULT; >> inst->copy_range = 0xfffff; >> inst->copy_mode = NFQNL_COPY_NONE; >> spin_lock_init(&inst->lock); >> - INIT_LIST_HEAD(&inst->queue_list); >> + inst->queue_htblsiz = NFQNL_QHTBLSIZ_DEFAULT; >> + inst->id_increment = INT_MAX / inst->queue_htblsiz; >> + inst->id_limit = inst->id_increment * inst->queue_htblsiz; >> + inst->reciprocal_value = reciprocal_value(inst->id_increment); >> + inst->queue_htbl = kmalloc(sizeof(struct list_head) * >> + inst->queue_htblsiz, GFP_KERNEL); >> + if (inst->queue_htbl == NULL) { >> + inst->queue_htbl = vmalloc(sizeof(struct list_head) * >> + inst->queue_htblsiz); >> + if (inst->queue_htbl == NULL) { >> + err = -ENOMEM; >> + goto out_free; >> + } >> + inst->vmalloc = true; >> + } >> + for (i = 0; i < inst->queue_htblsiz; i++) >> + INIT_LIST_HEAD(&inst->queue_htbl[i]); >> >> if (!try_module_get(THIS_MODULE)) { >> err = -EAGAIN; >> goto out_free; >> } >> + rcu_read_lock(); >> >> + spin_lock(&instances_lock); >> + if (instance_lookup(queue_num)) { >> + err = -EEXIST; >> + spin_unlock(&instances_lock); >> + rcu_read_unlock(); >> + goto out_free; >> + } >> h = instance_hashfn(queue_num); >> hlist_add_head_rcu(&inst->hlist, &instance_table[h]); >> - 
spin_unlock(&instances_lock); >> >> return inst; >> >> out_free: >> + if (inst->queue_htbl) { >> + if (inst->vmalloc) >> + vfree(inst->queue_htbl); >> + else >> + kfree(inst->queue_htbl); >> + } >> kfree(inst); >> -out_unlock: >> - spin_unlock(&instances_lock); >> +out_lock: >> + rcu_read_lock(); >> return ERR_PTR(err); >> } >> >> @@ -143,8 +191,7 @@ instance_destroy_rcu(struct rcu_head *head) >> rcu); >> >> nfqnl_flush(inst, NULL, 0); >> - kfree(inst); >> - module_put(THIS_MODULE); >> + schedule_work(&inst->work); >> } >> >> static void >> @@ -162,32 +209,67 @@ instance_destroy(struct nfqnl_instance *inst) >> spin_unlock(&instances_lock); >> } >> >> +static inline struct list_head *nfqnl_head_get(struct nfqnl_instance *queue, >> + unsigned int id) >> +{ >> + return &queue->queue_htbl[reciprocal_divide(id, >> + queue->reciprocal_value)]; >> +} >> + >> +static struct nf_queue_entry *__find_entry(struct nfqnl_instance *queue, >> + u32 id) >> +{ >> + struct nf_queue_entry *entry; >> + struct list_head *head; >> + >> + head = nfqnl_head_get(queue, id); >> + list_for_each_entry(entry, head, list) { >> + if (entry->id == id) >> + return entry; >> + } >> + >> + return NULL; >> +} >> + >> +static u32 __get_uniq_id(struct nfqnl_instance *queue) >> +{ >> + u32 i; >> + >> + for (i = 0; i < INT_MAX; i++) { >> + queue->id_sequence += queue->id_increment; >> + if (queue->id_sequence >= queue->id_limit) { >> + if (++queue->id_offset >= queue->id_increment) >> + queue->id_offset = 0; >> + queue->id_sequence = queue->id_offset; >> + } >> + if (__find_entry(queue, queue->id_sequence) == NULL) >> + return queue->id_sequence; >> + } >> + >> + return INT_MAX; >> +} >> + >> static inline void >> __enqueue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry) >> { >> - list_add_tail(&entry->list, &queue->queue_list); >> - queue->queue_total++; >> + struct list_head *head; >> + >> + head = nfqnl_head_get(queue, entry->id); >> + list_add_tail(&entry->list, head); >> + 
queue->queue_total++; >> } >> >> static struct nf_queue_entry * >> find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id) >> { >> - struct nf_queue_entry *entry = NULL, *i; >> + struct nf_queue_entry *entry; >> >> spin_lock_bh(&queue->lock); >> - >> - list_for_each_entry(i, &queue->queue_list, list) { >> - if (i->id == id) { >> - entry = i; >> - break; >> - } >> - } >> - >> + entry = __find_entry(queue, id); >> if (entry) { >> list_del(&entry->list); >> queue->queue_total--; >> } >> - >> spin_unlock_bh(&queue->lock); >> >> return entry; >> @@ -197,13 +279,22 @@ static void >> nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data) >> { >> struct nf_queue_entry *entry, *next; >> + unsigned int i, total; >> + struct list_head *head; >> >> spin_lock_bh(&queue->lock); >> - list_for_each_entry_safe(entry, next, &queue->queue_list, list) { >> - if (!cmpfn || cmpfn(entry, data)) { >> - list_del(&entry->list); >> - queue->queue_total--; >> - nf_reinject(entry, NF_DROP); >> + total = queue->queue_total; >> + for (i = 0; i < queue->queue_htblsiz; i++) { >> + if (total < 1) >> + break; >> + head = &queue->queue_htbl[i]; >> + list_for_each_entry_safe(entry, next, head, list) { >> + if (!cmpfn || cmpfn(entry, data)) { >> + list_del(&entry->list); >> + queue->queue_total--; >> + nf_reinject(entry, NF_DROP); >> + } >> + --total; >> } >> } >> spin_unlock_bh(&queue->lock); >> @@ -262,7 +353,12 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, >> break; >> } >> >> - entry->id = queue->id_sequence++; >> + entry->id = __get_uniq_id(queue); >> + if (entry->id == INT_MAX) { >> + spin_unlock_bh(&queue->lock); >> + return NULL; >> + } >> + __enqueue_entry(queue, entry); >> >> spin_unlock_bh(&queue->lock); >> >> @@ -379,6 +475,7 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue, >> >> nlmsg_failure: >> nla_put_failure: >> + find_dequeue_entry(queue, entry->id); >> if (skb) >> kfree_skb(skb); >> if (net_ratelimit()) >> @@ -426,14 
+523,14 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum) >> goto err_out_unlock; >> } >> >> - __enqueue_entry(queue, entry); >> - >> spin_unlock_bh(&queue->lock); >> return 0; >> >> err_out_free_nskb: >> kfree_skb(nskb); >> err_out_unlock: >> + list_del(&entry->list); >> + queue->queue_total--; >> spin_unlock_bh(&queue->lock); >> err_out: >> return -1; >> @@ -686,6 +783,77 @@ static const struct nf_queue_handler nfqh = { >> .outfn = &nfqnl_enqueue_packet, >> }; >> >> +static int nfqnl_htbl_resize(u16 queue_num, int pid, unsigned int size) >> +{ >> + struct nfqnl_instance *queue; >> + unsigned int i, total; >> + struct list_head *h, *htbl; >> + bool is_vmalloc; >> + int err; >> + >> + if (size < 1 || size > INT_MAX) >> + return -EINVAL; >> + >> + rcu_read_unlock(); > > Again, this seems strange. As near as I can tell, the caller immediately > exits the RCU read-side critical section, so why not have the caller do > the exit immediately before calling nfqnl_htbl_resize()? > It sounds reasonable. Thanks. -- Regards, Changli Gao(xiaosuo@xxxxxxxxx) -- To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html