Re: [PATCH] nfnetlink_queue: use hash table to speed up entry lookup

Changli Gao <xiaosuo@xxxxxxxxx> · Sat, 1 May 2010 08:14:43 +0800

On Thu, Apr 22, 2010 at 4:23 AM, Paul E. McKenney
<paulmck@xxxxxxxxxxxxxxxxxx> wrote:
> On Tue, Apr 20, 2010 at 10:31:36PM +0800, Changli Gao wrote:
>> use hash table to speed up entry lookup
>>
>> A hash table is used to speed up entry lookup when the verdicts aren't received
>> in order. The size of hash table can be specified by NFQA_CFG_QUEUE_HTBLSIZ.
>> Its default value is 1. Reciprocal division is used to lower the cost of
>> division, and the entry IDs are generated carefully to get fair entry
>> distribution in the buckets of the hash table.
>
> A few questions interspersed below.
>
>                                                        Thanx, Paul
>
>> Signed-off-by: Changli Gao <xiaosuo@xxxxxxxxx>
>> ----
>>  include/linux/netfilter/nfnetlink_queue.h |    1
>>  init/Kconfig                              |    1
>>  lib/Kconfig                               |    3
>>  lib/Makefile                              |    4
>>  lib/reciprocal_div.c                      |    2
>>  net/netfilter/Kconfig                     |    1
>>  net/netfilter/nfnetlink_queue.c           |  252 +++++++++++++++++++++++++-----
>>  7 files changed, 227 insertions(+), 37 deletions(-)
>> diff --git a/include/linux/netfilter/nfnetlink_queue.h b/include/linux/netfilter/nfnetlink_queue.h
>> index 2455fe5..77b1566 100644
>> --- a/include/linux/netfilter/nfnetlink_queue.h
>> +++ b/include/linux/netfilter/nfnetlink_queue.h
>> @@ -83,6 +83,7 @@ enum nfqnl_attr_config {
>>       NFQA_CFG_CMD,                   /* nfqnl_msg_config_cmd */
>>       NFQA_CFG_PARAMS,                /* nfqnl_msg_config_params */
>>       NFQA_CFG_QUEUE_MAXLEN,          /* __u32 */
>> +     NFQA_CFG_QUEUE_HTBLSIZ,         /* __u32 */
>>       __NFQA_CFG_MAX
>>  };
>>  #define NFQA_CFG_MAX (__NFQA_CFG_MAX-1)
>> diff --git a/init/Kconfig b/init/Kconfig
>> index cb6069e..4b4266f 100644
>> --- a/init/Kconfig
>> +++ b/init/Kconfig
>> @@ -1059,6 +1059,7 @@ choice
>>
>>  config SLAB
>>       bool "SLAB"
>> +     select RECIPROCAL_DIV
>>       help
>>         The regular slab allocator that is established and known to work
>>         well in all environments. It organizes cache hot objects in
>> diff --git a/lib/Kconfig b/lib/Kconfig
>> index af12831..0c4b5ec 100644
>> --- a/lib/Kconfig
>> +++ b/lib/Kconfig
>> @@ -231,4 +231,7 @@ config IOQ
>>
>>        If unsure, say N
>>
>> +config RECIPROCAL_DIV
>> +       bool
>> +
>>  endmenu
>> diff --git a/lib/Makefile b/lib/Makefile
>> index 0a6ab6f..c3555bd 100644
>> --- a/lib/Makefile
>> +++ b/lib/Makefile
>> @@ -10,7 +10,7 @@ endif
>>  lib-y := ctype.o string.o vsprintf.o cmdline.o \
>>        rbtree.o radix-tree.o dump_stack.o \
>>        idr.o int_sqrt.o extable.o prio_tree.o \
>> -      sha1.o irq_regs.o reciprocal_div.o argv_split.o \
>> +      sha1.o irq_regs.o argv_split.o \
>>        proportions.o prio_heap.o ratelimit.o show_mem.o \
>>        is_single_threaded.o plist.o decompress.o flex_array.o
>>
>> @@ -103,6 +103,8 @@ obj-$(CONFIG_GENERIC_CSUM) += checksum.o
>>
>>  obj-$(CONFIG_GENERIC_ATOMIC64) += atomic64.o
>>
>> +obj-$(CONFIG_RECIPROCAL_DIV) += reciprocal_div.o
>> +
>>  hostprogs-y  := gen_crc32table
>>  clean-files  := crc32table.h
>>
>> diff --git a/lib/reciprocal_div.c b/lib/reciprocal_div.c
>> index 6a3bd48..39f2e5e 100644
>> --- a/lib/reciprocal_div.c
>> +++ b/lib/reciprocal_div.c
>> @@ -1,5 +1,6 @@
>>  #include <asm/div64.h>
>>  #include <linux/reciprocal_div.h>
>> +#include <linux/module.h>
>>
>>  u32 reciprocal_value(u32 k)
>>  {
>> @@ -7,3 +8,4 @@ u32 reciprocal_value(u32 k)
>>       do_div(val, k);
>>       return (u32)val;
>>  }
>> +EXPORT_SYMBOL(reciprocal_value);
>> diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
>> index 18d77b5..40b34d5 100644
>> --- a/net/netfilter/Kconfig
>> +++ b/net/netfilter/Kconfig
>> @@ -8,6 +8,7 @@ config NETFILTER_NETLINK_QUEUE
>>       tristate "Netfilter NFQUEUE over NFNETLINK interface"
>>       depends on NETFILTER_ADVANCED
>>       select NETFILTER_NETLINK
>> +     select RECIPROCAL_DIV
>>       help
>>         If this option is enabled, the kernel will include support
>>         for queueing packets via NFNETLINK.
>> diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
>> index e70a6ef..d3d02b7 100644
>> --- a/net/netfilter/nfnetlink_queue.c
>> +++ b/net/netfilter/nfnetlink_queue.c
>> @@ -28,6 +28,8 @@
>>  #include <linux/netfilter/nfnetlink.h>
>>  #include <linux/netfilter/nfnetlink_queue.h>
>>  #include <linux/list.h>
>> +#include <linux/vmalloc.h>
>> +#include <linux/reciprocal_div.h>
>>  #include <net/sock.h>
>>  #include <net/netfilter/nf_queue.h>
>>
>> @@ -37,11 +39,13 @@
>>  #include "../bridge/br_private.h"
>>  #endif
>>
>> -#define NFQNL_QMAX_DEFAULT 1024
>> +#define NFQNL_QMAX_DEFAULT   1024
>> +#define NFQNL_QHTBLSIZ_DEFAULT       1
>>
>>  struct nfqnl_instance {
>>       struct hlist_node hlist;                /* global list of queues */
>>       struct rcu_head rcu;
>> +     struct work_struct work;
>>
>>       int peer_pid;
>>       unsigned int queue_maxlen;
>> @@ -49,15 +53,21 @@ struct nfqnl_instance {
>>       unsigned int queue_total;
>>       unsigned int queue_dropped;
>>       unsigned int queue_user_dropped;
>> +     unsigned int queue_htblsiz;
>> +     u32 reciprocal_value;
>>
>>       unsigned int id_sequence;               /* 'sequence' of pkt ids */
>> +     unsigned int id_increment;
>> +     unsigned int id_offset;
>> +     unsigned int id_limit;
>>
>>       u_int16_t queue_num;                    /* number of this queue */
>>       u_int8_t copy_mode;
>>
>>       spinlock_t lock;
>>
>> -     struct list_head queue_list;            /* packets in queue */
>> +     struct list_head *queue_htbl;           /* packets in queue */
>> +     bool vmalloc;
>>  };
>>
>>  typedef int (*nfqnl_cmpfn)(struct nf_queue_entry *, unsigned long);
>> @@ -87,49 +97,87 @@ instance_lookup(u_int16_t queue_num)
>>       return NULL;
>>  }
>>
>> +static void instance_destroy_work(struct work_struct *work)
>> +{
>> +     struct nfqnl_instance *inst;
>> +
>> +     inst = container_of(work, struct nfqnl_instance, work);
>> +     if (inst->vmalloc)
>> +             vfree(inst->queue_htbl);
>> +     else
>> +             kfree(inst->queue_htbl);
>> +     kfree(inst);
>> +     module_put(THIS_MODULE);
>> +}
>> +
>>  static struct nfqnl_instance *
>>  instance_create(u_int16_t queue_num, int pid)
>>  {
>>       struct nfqnl_instance *inst;
>> -     unsigned int h;
>> +     unsigned int h, i;
>>       int err;
>>
>> -     spin_lock(&instances_lock);
>> -     if (instance_lookup(queue_num)) {
>> -             err = -EEXIST;
>> -             goto out_unlock;
>> -     }
>> -
>> +     rcu_read_unlock();
>
> This seems strange -- are all the callers aware that instance_create()
> temporarily exits the RCU read-side critical section?

There is only one caller, so it is awares. But it seems that I should
add suffix "_rcu_read_locked()" to instrance_create().

>
>>       inst = kzalloc(sizeof(*inst), GFP_ATOMIC);
>>       if (!inst) {
>>               err = -ENOMEM;
>> -             goto out_unlock;
>> +             goto out_lock;
>>       }
>>
>> +     INIT_WORK(&inst->work, instance_destroy_work);
>>       inst->queue_num = queue_num;
>>       inst->peer_pid = pid;
>>       inst->queue_maxlen = NFQNL_QMAX_DEFAULT;
>>       inst->copy_range = 0xfffff;
>>       inst->copy_mode = NFQNL_COPY_NONE;
>>       spin_lock_init(&inst->lock);
>> -     INIT_LIST_HEAD(&inst->queue_list);
>> +     inst->queue_htblsiz = NFQNL_QHTBLSIZ_DEFAULT;
>> +     inst->id_increment = INT_MAX / inst->queue_htblsiz;
>> +     inst->id_limit = inst->id_increment * inst->queue_htblsiz;
>> +     inst->reciprocal_value = reciprocal_value(inst->id_increment);
>> +     inst->queue_htbl = kmalloc(sizeof(struct list_head) *
>> +                                inst->queue_htblsiz, GFP_KERNEL);
>> +     if (inst->queue_htbl == NULL) {
>> +             inst->queue_htbl = vmalloc(sizeof(struct list_head) *
>> +                                        inst->queue_htblsiz);
>> +             if (inst->queue_htbl == NULL) {
>> +                     err = -ENOMEM;
>> +                     goto out_free;
>> +             }
>> +             inst->vmalloc = true;
>> +     }
>> +     for (i = 0; i < inst->queue_htblsiz; i++)
>> +             INIT_LIST_HEAD(&inst->queue_htbl[i]);
>>
>>       if (!try_module_get(THIS_MODULE)) {
>>               err = -EAGAIN;
>>               goto out_free;
>>       }
>> +     rcu_read_lock();
>>
>> +     spin_lock(&instances_lock);
>> +     if (instance_lookup(queue_num)) {
>> +             err = -EEXIST;
>> +             spin_unlock(&instances_lock);
>> +             rcu_read_unlock();
>> +             goto out_free;
>> +     }
>>       h = instance_hashfn(queue_num);
>>       hlist_add_head_rcu(&inst->hlist, &instance_table[h]);
>> -
>>       spin_unlock(&instances_lock);
>>
>>       return inst;
>>
>>  out_free:
>> +     if (inst->queue_htbl) {
>> +             if (inst->vmalloc)
>> +                     vfree(inst->queue_htbl);
>> +             else
>> +                     kfree(inst->queue_htbl);
>> +     }
>>       kfree(inst);
>> -out_unlock:
>> -     spin_unlock(&instances_lock);
>> +out_lock:
>> +     rcu_read_lock();
>>       return ERR_PTR(err);
>>  }
>>
>> @@ -143,8 +191,7 @@ instance_destroy_rcu(struct rcu_head *head)
>>                                                  rcu);
>>
>>       nfqnl_flush(inst, NULL, 0);
>> -     kfree(inst);
>> -     module_put(THIS_MODULE);
>> +     schedule_work(&inst->work);
>>  }
>>
>>  static void
>> @@ -162,32 +209,67 @@ instance_destroy(struct nfqnl_instance *inst)
>>       spin_unlock(&instances_lock);
>>  }
>>
>> +static inline struct list_head *nfqnl_head_get(struct nfqnl_instance *queue,
>> +                                            unsigned int id)
>> +{
>> +     return &queue->queue_htbl[reciprocal_divide(id,
>> +                                                 queue->reciprocal_value)];
>> +}
>> +
>> +static struct nf_queue_entry *__find_entry(struct nfqnl_instance *queue,
>> +                                        u32 id)
>> +{
>> +     struct nf_queue_entry *entry;
>> +     struct list_head *head;
>> +
>> +     head = nfqnl_head_get(queue, id);
>> +     list_for_each_entry(entry, head, list) {
>> +             if (entry->id == id)
>> +                     return entry;
>> +     }
>> +
>> +     return NULL;
>> +}
>> +
>> +static u32 __get_uniq_id(struct nfqnl_instance *queue)
>> +{
>> +     u32 i;
>> +
>> +     for (i = 0; i < INT_MAX; i++) {
>> +             queue->id_sequence += queue->id_increment;
>> +             if (queue->id_sequence >= queue->id_limit) {
>> +                     if (++queue->id_offset >= queue->id_increment)
>> +                             queue->id_offset = 0;
>> +                     queue->id_sequence = queue->id_offset;
>> +             }
>> +             if (__find_entry(queue, queue->id_sequence) == NULL)
>> +                     return queue->id_sequence;
>> +     }
>> +
>> +     return INT_MAX;
>> +}
>> +
>>  static inline void
>>  __enqueue_entry(struct nfqnl_instance *queue, struct nf_queue_entry *entry)
>>  {
>> -       list_add_tail(&entry->list, &queue->queue_list);
>> -       queue->queue_total++;
>> +     struct list_head *head;
>> +
>> +     head = nfqnl_head_get(queue, entry->id);
>> +     list_add_tail(&entry->list, head);
>> +     queue->queue_total++;
>>  }
>>
>>  static struct nf_queue_entry *
>>  find_dequeue_entry(struct nfqnl_instance *queue, unsigned int id)
>>  {
>> -     struct nf_queue_entry *entry = NULL, *i;
>> +     struct nf_queue_entry *entry;
>>
>>       spin_lock_bh(&queue->lock);
>> -
>> -     list_for_each_entry(i, &queue->queue_list, list) {
>> -             if (i->id == id) {
>> -                     entry = i;
>> -                     break;
>> -             }
>> -     }
>> -
>> +     entry = __find_entry(queue, id);
>>       if (entry) {
>>               list_del(&entry->list);
>>               queue->queue_total--;
>>       }
>> -
>>       spin_unlock_bh(&queue->lock);
>>
>>       return entry;
>> @@ -197,13 +279,22 @@ static void
>>  nfqnl_flush(struct nfqnl_instance *queue, nfqnl_cmpfn cmpfn, unsigned long data)
>>  {
>>       struct nf_queue_entry *entry, *next;
>> +     unsigned int i, total;
>> +     struct list_head *head;
>>
>>       spin_lock_bh(&queue->lock);
>> -     list_for_each_entry_safe(entry, next, &queue->queue_list, list) {
>> -             if (!cmpfn || cmpfn(entry, data)) {
>> -                     list_del(&entry->list);
>> -                     queue->queue_total--;
>> -                     nf_reinject(entry, NF_DROP);
>> +     total = queue->queue_total;
>> +     for (i = 0; i < queue->queue_htblsiz; i++) {
>> +             if (total < 1)
>> +                     break;
>> +             head = &queue->queue_htbl[i];
>> +             list_for_each_entry_safe(entry, next, head, list) {
>> +                     if (!cmpfn || cmpfn(entry, data)) {
>> +                             list_del(&entry->list);
>> +                             queue->queue_total--;
>> +                             nf_reinject(entry, NF_DROP);
>> +                     }
>> +                     --total;
>>               }
>>       }
>>       spin_unlock_bh(&queue->lock);
>> @@ -262,7 +353,12 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
>>               break;
>>       }
>>
>> -     entry->id = queue->id_sequence++;
>> +     entry->id = __get_uniq_id(queue);
>> +     if (entry->id == INT_MAX) {
>> +             spin_unlock_bh(&queue->lock);
>> +             return NULL;
>> +     }
>> +     __enqueue_entry(queue, entry);
>>
>>       spin_unlock_bh(&queue->lock);
>>
>> @@ -379,6 +475,7 @@ nfqnl_build_packet_message(struct nfqnl_instance *queue,
>>
>>  nlmsg_failure:
>>  nla_put_failure:
>> +     find_dequeue_entry(queue, entry->id);
>>       if (skb)
>>               kfree_skb(skb);
>>       if (net_ratelimit())
>> @@ -426,14 +523,14 @@ nfqnl_enqueue_packet(struct nf_queue_entry *entry, unsigned int queuenum)
>>               goto err_out_unlock;
>>       }
>>
>> -     __enqueue_entry(queue, entry);
>> -
>>       spin_unlock_bh(&queue->lock);
>>       return 0;
>>
>>  err_out_free_nskb:
>>       kfree_skb(nskb);
>>  err_out_unlock:
>> +     list_del(&entry->list);
>> +     queue->queue_total--;
>>       spin_unlock_bh(&queue->lock);
>>  err_out:
>>       return -1;
>> @@ -686,6 +783,77 @@ static const struct nf_queue_handler nfqh = {
>>       .outfn  = &nfqnl_enqueue_packet,
>>  };
>>
>> +static int nfqnl_htbl_resize(u16 queue_num, int pid, unsigned int size)
>> +{
>> +     struct nfqnl_instance *queue;
>> +     unsigned int i, total;
>> +     struct list_head *h, *htbl;
>> +     bool is_vmalloc;
>> +     int err;
>> +
>> +     if (size < 1 || size > INT_MAX)
>> +             return -EINVAL;
>> +
>> +     rcu_read_unlock();
>
> Again, this seems strange.  As near as I can tell, the caller immediately
> exits the RCU read-side critical section, so why not have the caller do
> the exit immediately before calling nfqnl_htbl_resize()?
>

It sounds reasonable. Thanks.

-- 
Regards，
Changli Gao(xiaosuo@xxxxxxxxx)
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at  http://vger.kernel.org/majordomo-info.html