From: Patrick Kelsey <pat.kelsey@xxxxxxxxxxxxxxxxxxxx>

Place struct mmu_rb_handler on a cache line start like so:

	struct mmu_rb_handler *h;
	void *free_ptr;
	int ret;

	free_ptr = kzalloc(sizeof(*h) + cache_line_size() - 1, GFP_KERNEL);
	if (!free_ptr)
		return -ENOMEM;

	h = PTR_ALIGN(free_ptr, cache_line_size());

Additionally, move the struct mmu_rb_handler fields "root" and
"ops_arg" so that they start on the next cacheline boundary, using the
"____cacheline_aligned_in_smp" annotation.

Allocating an additional cache_line_size() - 1 bytes to place struct
mmu_rb_handler on a cache line start does increase memory consumption.
However, only a few struct mmu_rb_handler objects are created while
hfi1 is in use.

As mmu_rb_handler->root and mmu_rb_handler->ops_arg are accessed
frequently, the advantage of having them both within the same cache
line is expected to outweigh the disadvantage of the additional memory
consumption per struct mmu_rb_handler.

Signed-off-by: Brendan Cunningham <bcunningham@xxxxxxxxxxxxxxxxxxxx>
Signed-off-by: Patrick Kelsey <pat.kelsey@xxxxxxxxxxxxxxxxxxxx>
Signed-off-by: Dennis Dalessandro <dennis.dalessandro@xxxxxxxxxxxxxxxxxxxx>
---
 drivers/infiniband/hw/hfi1/mmu_rb.c | 11 +++++++----
 drivers/infiniband/hw/hfi1/mmu_rb.h | 14 ++++++++++++--
 2 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c
index 71b9ac018887..1cea8b0c78e0 100644
--- a/drivers/infiniband/hw/hfi1/mmu_rb.c
+++ b/drivers/infiniband/hw/hfi1/mmu_rb.c
@@ -46,12 +46,14 @@ int hfi1_mmu_rb_register(void *ops_arg,
 			 struct mmu_rb_handler **handler)
 {
 	struct mmu_rb_handler *h;
+	void *free_ptr;
 	int ret;
 
-	h = kzalloc(sizeof(*h), GFP_KERNEL);
-	if (!h)
+	free_ptr = kzalloc(sizeof(*h) + cache_line_size() - 1, GFP_KERNEL);
+	if (!free_ptr)
 		return -ENOMEM;
 
+	h = PTR_ALIGN(free_ptr, cache_line_size());
 	h->root = RB_ROOT_CACHED;
 	h->ops = ops;
 	h->ops_arg = ops_arg;
@@ -62,10 +64,11 @@ int hfi1_mmu_rb_register(void *ops_arg,
 	INIT_LIST_HEAD(&h->del_list);
 	INIT_LIST_HEAD(&h->lru_list);
 	h->wq = wq;
+	h->free_ptr = free_ptr;
 
 	ret = mmu_notifier_register(&h->mn, current->mm);
 	if (ret) {
-		kfree(h);
+		kfree(free_ptr);
 		return ret;
 	}
 
@@ -108,7 +111,7 @@ void hfi1_mmu_rb_unregister(struct mmu_rb_handler *handler)
 	/* Now the mm may be freed. */
 	mmdrop(handler->mn.mm);
 
-	kfree(handler);
+	kfree(handler->free_ptr);
 }
 
 int hfi1_mmu_rb_insert(struct mmu_rb_handler *handler,
diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.h b/drivers/infiniband/hw/hfi1/mmu_rb.h
index ed75acdb7b83..c4da064188c9 100644
--- a/drivers/infiniband/hw/hfi1/mmu_rb.h
+++ b/drivers/infiniband/hw/hfi1/mmu_rb.h
@@ -33,15 +33,25 @@ struct mmu_rb_ops {
 };
 
 struct mmu_rb_handler {
+	/*
+	 * struct mmu_notifier is 56 bytes, and spinlock_t is 4 bytes, so
+	 * they fit together in one cache line.  mn is relatively rarely
+	 * accessed, so co-locating the spinlock with it achieves much of
+	 * the cacheline contention reduction of giving the spinlock its
+	 * own cacheline without the overhead of doing so.
+	 */
 	struct mmu_notifier mn;
-	struct rb_root_cached root;
-	void *ops_arg;
 	spinlock_t lock; /* protect the RB tree */
+
+	/* Begin on a new cacheline boundary here */
+	struct rb_root_cached root ____cacheline_aligned_in_smp;
+	void *ops_arg;
 	struct mmu_rb_ops *ops;
 	struct list_head lru_list;
 	struct work_struct del_work;
 	struct list_head del_list;
 	struct workqueue_struct *wq;
+	void *free_ptr;
 };
 
 int hfi1_mmu_rb_register(void *ops_arg,
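
As an aside for readers less familiar with the idiom: below is a
minimal, self-contained userspace sketch of the same
over-allocate-and-align pattern. The 64-byte line size, the struct, and
the main() harness are assumptions for illustration only; the patch
itself uses cache_line_size() and PTR_ALIGN() in the kernel.

	/*
	 * Userspace sketch of the over-allocate-and-align pattern
	 * (assumed 64-byte cache line; the kernel queries
	 * cache_line_size() at runtime).  The allocation is padded by
	 * alignment - 1 bytes so an aligned pointer is guaranteed to
	 * exist somewhere inside it.
	 */
	#include <stdint.h>
	#include <stdio.h>
	#include <stdlib.h>

	#define CACHE_LINE_SIZE 64	/* assumption for illustration */

	struct handler {		/* hypothetical payload */
		char payload[200];
	};

	int main(void)
	{
		void *free_ptr;		/* raw pointer; this is what gets freed */
		struct handler *h;

		free_ptr = calloc(1, sizeof(*h) + CACHE_LINE_SIZE - 1);
		if (!free_ptr)
			return 1;

		/* Hand-rolled equivalent of PTR_ALIGN(free_ptr, CACHE_LINE_SIZE). */
		h = (struct handler *)(((uintptr_t)free_ptr + CACHE_LINE_SIZE - 1) &
				       ~(uintptr_t)(CACHE_LINE_SIZE - 1));

		printf("raw %p, aligned %p\n", free_ptr, (void *)h);

		/* Free the raw pointer, never the aligned one. */
		free(free_ptr);
		return 0;
	}

Keeping the raw pointer around (the new free_ptr field in struct
mmu_rb_handler) is what makes the kfree() in unregister correct: the
aligned pointer may not be the address kzalloc() returned.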
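
Likewise, a sketch of the layout effect of ____cacheline_aligned_in_smp,
which on SMP builds expands to an aligned attribute with
SMP_CACHE_BYTES. The 64-byte figure and the stand-in member sizes are
assumptions for illustration:

	/*
	 * Layout sketch: mn (56 bytes) and lock (4 bytes) share cache
	 * line 0, and the aligned attribute pads the struct so that
	 * root starts on the next line boundary (offset 64 with the
	 * assumed 64-byte line), with ops_arg immediately after it.
	 */
	#include <stddef.h>
	#include <stdio.h>

	#define CACHE_BYTES 64		/* stand-in for SMP_CACHE_BYTES */

	struct layout_demo {
		char mn[56];		/* stands in for struct mmu_notifier */
		int lock;		/* stands in for spinlock_t */
		long root __attribute__((aligned(CACHE_BYTES)));
		void *ops_arg;		/* lands on the same line as root */
	};

	int main(void)
	{
		printf("offsetof(lock)    = %zu\n", offsetof(struct layout_demo, lock));
		printf("offsetof(root)    = %zu\n", offsetof(struct layout_demo, root));
		printf("offsetof(ops_arg) = %zu\n", offsetof(struct layout_demo, ops_arg));
		return 0;
	}

With the assumed sizes this prints 56, 64, and 72: the hot rbtree root
and ops_arg share one line, away from the notifier/lock line.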