From: Rao Shoaib <rao.shoaib@xxxxxxxxxx>

This patch updates kfree_rcu() to use the new bulk memory free
functions, as they are more efficient. It also moves kfree_call_rcu()
out of the RCU code and into mm/slab_common.c.

Signed-off-by: Rao Shoaib <rao.shoaib@xxxxxxxxxx>
---
 include/linux/mm.h |   5 ++
 kernel/rcu/tree.c  |  14 ----
 kernel/sysctl.c    |  40 +++++++++++
 mm/slab.h          |  23 +++++++
 mm/slab_common.c   | 198 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 5 files changed, 264 insertions(+), 16 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index ea818ff..8ae4f25 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2669,5 +2669,10 @@ void __init setup_nr_node_ids(void);
 static inline void setup_nr_node_ids(void) {}
 #endif
 
+extern int sysctl_kfree_rcu_drain_limit;
+extern int sysctl_kfree_rcu_poll_limit;
+extern int sysctl_kfree_rcu_empty_limit;
+extern int sysctl_kfree_rcu_caching_allowed;
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index f9c0ca2..69951ef 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3209,20 +3209,6 @@ void call_rcu_bh(struct rcu_head *head, rcu_callback_t func)
 EXPORT_SYMBOL_GPL(call_rcu_bh);
 
 /*
- * Queue an RCU callback for lazy invocation after a grace period.
- * This will likely be later named something like "call_rcu_lazy()",
- * but this change will require some way of tagging the lazy RCU
- * callbacks in the list of pending callbacks. Until then, this
- * function may only be called from __kfree_rcu().
- */
-void kfree_call_rcu(struct rcu_head *head,
-		    rcu_callback_t func)
-{
-	__call_rcu(head, func, rcu_state_p, -1, 1);
-}
-EXPORT_SYMBOL_GPL(kfree_call_rcu);
-
-/*
  * Because a context switch is a grace period for RCU-sched and RCU-bh,
  * any blocking grace-period wait automatically implies a grace period
  * if there is only one CPU online at any point time during execution
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 557d467..47b48f7 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1655,6 +1655,46 @@ static struct ctl_table vm_table[] = {
 		.extra2		= (void *)&mmap_rnd_compat_bits_max,
 	},
 #endif
+	{
+		.procname	= "kfree_rcu_drain_limit",
+		.data		= &sysctl_kfree_rcu_drain_limit,
+		.maxlen		= sizeof(sysctl_kfree_rcu_drain_limit),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &one,
+		.extra2		= &one_hundred,
+	},
+
+	{
+		.procname	= "kfree_rcu_poll_limit",
+		.data		= &sysctl_kfree_rcu_poll_limit,
+		.maxlen		= sizeof(sysctl_kfree_rcu_poll_limit),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &one,
+		.extra2		= &one_hundred,
+	},
+
+	{
+		.procname	= "kfree_rcu_empty_limit",
+		.data		= &sysctl_kfree_rcu_empty_limit,
+		.maxlen		= sizeof(sysctl_kfree_rcu_empty_limit),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &four,
+	},
+
+	{
+		.procname	= "kfree_rcu_caching_allowed",
+		.data		= &sysctl_kfree_rcu_caching_allowed,
+		.maxlen		= sizeof(sysctl_kfree_rcu_caching_allowed),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
+
 	{ }
 };
diff --git a/mm/slab.h b/mm/slab.h
index ad657ff..2541f70 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -78,6 +78,29 @@ extern const struct kmalloc_info_struct {
 	unsigned long size;
 } kmalloc_info[];
 
+#define RCU_MAX_ACCUMULATE_SIZE	25
+
+struct rcu_bulk_free_container {
+	struct rcu_head rbfc_rcu;
+	int rbfc_entries;
+	void *rbfc_data[RCU_MAX_ACCUMULATE_SIZE];
+	struct rcu_bulk_free *rbfc_rbf;
+};
+
+struct rcu_bulk_free {
+	struct rcu_head rbf_rcu; /* used to schedule monitor process */
+	spinlock_t rbf_lock;
+	struct rcu_bulk_free_container *rbf_container;
+	struct rcu_bulk_free_container *rbf_cached_container;
+	struct rcu_head *rbf_list_head;
+	int rbf_list_size;
+	int rbf_cpu;
+	int rbf_empty;
+	int rbf_polled;
+	bool rbf_init;
+	bool rbf_monitor;
+};
+
 unsigned long calculate_alignment(slab_flags_t flags,
 		unsigned long align, unsigned long size);
diff --git a/mm/slab_common.c b/mm/slab_common.c
index c8cb367..06fd12c 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -20,6 +20,7 @@
 #include <asm/tlbflush.h>
 #include <asm/page.h>
 #include <linux/memcontrol.h>
+#include <linux/types.h>
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/kmem.h>
@@ -129,6 +130,7 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr,
 
 	for (i = 0; i < nr; i++) {
 		void *x = p[i] = kmem_cache_alloc(s, flags);
+
 		if (!x) {
 			__kmem_cache_free_bulk(s, i, p);
 			return 0;
@@ -353,6 +355,7 @@ unsigned long calculate_alignment(slab_flags_t flags,
 	 */
 	if (flags & SLAB_HWCACHE_ALIGN) {
 		unsigned long ralign = cache_line_size();
+
 		while (size <= ralign / 2)
 			ralign /= 2;
 		align = max(align, ralign);
@@ -444,9 +447,8 @@ kmem_cache_create(const char *name, size_t size, size_t align,
 	mutex_lock(&slab_mutex);
 
 	err = kmem_cache_sanity_check(name, size);
-	if (err) {
+	if (err)
 		goto out_unlock;
-	}
 
 	/* Refuse requests with allocator specific flags */
 	if (flags & ~SLAB_FLAGS_PERMITTED) {
@@ -1131,6 +1133,7 @@ EXPORT_SYMBOL(kmalloc_order);
 void *kmalloc_order_trace(size_t size, gfp_t flags, unsigned int order)
 {
 	void *ret = kmalloc_order(size, flags, order);
+
 	trace_kmalloc(_RET_IP_, ret, size, PAGE_SIZE << order, flags);
 	return ret;
 }
@@ -1483,6 +1486,197 @@ void kzfree(const void *p)
 }
 EXPORT_SYMBOL(kzfree);
 
+static DEFINE_PER_CPU(struct rcu_bulk_free, cpu_rbf);
+
+/* drain if at least this many objects are queued */
+int sysctl_kfree_rcu_drain_limit __read_mostly = 10;
+
+/* time to poll if fewer than drain_limit */
+int sysctl_kfree_rcu_poll_limit __read_mostly = 5;
+
+/* number of empty checks before the monitor exits */
+int sysctl_kfree_rcu_empty_limit __read_mostly = 2;
+
+int sysctl_kfree_rcu_caching_allowed __read_mostly = 1;
+
+/* RCU callback function. Frees the memory. */
+static void
+__rcu_bulk_free_impl(struct rcu_head *rbfc_rcu)
+{
+	struct rcu_bulk_free *rbf = NULL;
+	struct rcu_bulk_free_container *rbfc = container_of(rbfc_rcu,
+	    struct rcu_bulk_free_container, rbfc_rcu);
+
+	WARN_ON(rbfc->rbfc_entries <= 0);
+	kfree_bulk(rbfc->rbfc_entries, rbfc->rbfc_data);
+
+	rbf = rbfc->rbfc_rbf;
+	if (!sysctl_kfree_rcu_caching_allowed ||
+	    cmpxchg(&rbf->rbf_cached_container, NULL, rbfc) != NULL) {
+		kfree(rbfc);
+	}
+}
+
+/* Processes the list of rcu structures.
+ * Used when a container cannot be allocated.
+ */
+
+static void
+__rcu_bulk_schedule_list(struct rcu_bulk_free *rbf)
+{
+	int i = 0;
+
+	for (i = 0; i < rbf->rbf_list_size; i++) {
+		struct rcu_head *free_head;
+
+		free_head = rbf->rbf_list_head;
+		rbf->rbf_list_head = free_head->next;
+		free_head->next = NULL;
+		call_rcu(free_head, free_head->func);
+	}
+	WARN_ON(rbf->rbf_list_head != NULL);
+	rbf->rbf_list_size = 0;
+}
+
+
+/* RCU monitoring function -- submits elements for RCU reclaim */
+static void
+__rcu_bulk_free_monitor(struct rcu_head *rbf_rcu)
+{
+	struct rcu_bulk_free *rbf = NULL;
+	struct rcu_bulk_free_container *rbfc = NULL;
+
+	rbf = container_of(rbf_rcu, struct rcu_bulk_free, rbf_rcu);
+
+	spin_lock(&rbf->rbf_lock);
+
+	rbfc = rbf->rbf_container;
+
+	if (rbf->rbf_list_size > 0) {
+		WARN_ON(rbfc != NULL);
+		if ((rbf->rbf_list_size >= sysctl_kfree_rcu_drain_limit) ||
+		    rbf->rbf_polled >= sysctl_kfree_rcu_poll_limit) {
+			rbf->rbf_polled = 0;
+			__rcu_bulk_schedule_list(rbf);
+		} else {
+			rbf->rbf_polled++;
+		}
+	} else if (rbfc != NULL) {
+		WARN_ON(rbfc->rbfc_entries <= 0);
+		if ((rbfc->rbfc_entries > sysctl_kfree_rcu_drain_limit) ||
+		    rbf->rbf_polled++ >= sysctl_kfree_rcu_poll_limit) {
+			rbf->rbf_polled = 0;
+			call_rcu(&rbfc->rbfc_rcu, __rcu_bulk_free_impl);
+			rbf->rbf_container = NULL;
+		}
+	} else {
+		/* Nothing to do, keep track */
+		rbf->rbf_empty++;
+	}
+
+	if (rbf->rbf_empty >= sysctl_kfree_rcu_empty_limit) {
+		rbf->rbf_monitor = false;
+		rbf->rbf_empty = 0;
+	}
+
+	spin_unlock(&rbf->rbf_lock);
+
+	if (rbf->rbf_monitor)
+		call_rcu(&rbf->rbf_rcu, __rcu_bulk_free_monitor);
+}
+
+/* Main RCU function that is called to free RCU structures */
+static void
+__rcu_bulk_free(struct rcu_head *head, rcu_callback_t func, int cpu, bool lazy)
+{
+	unsigned long offset;
+	void *ptr;
+	struct rcu_bulk_free *rbf;
+	struct rcu_bulk_free_container *rbfc = NULL;
+
+	rbf = this_cpu_ptr(&cpu_rbf);
+
+	if (unlikely(!rbf->rbf_init)) {
+		spin_lock_init(&rbf->rbf_lock);
+		rbf->rbf_cpu = smp_processor_id();
+		rbf->rbf_init = true;
+	}
+
+	/* hold lock to protect against other CPUs */
+	spin_lock_bh(&rbf->rbf_lock);
+
+	rbfc = rbf->rbf_container;
+
+	if (rbfc == NULL) {
+		if (rbf->rbf_cached_container == NULL) {
+			rbf->rbf_container =
+				kmalloc(sizeof(struct rcu_bulk_free_container),
+				GFP_ATOMIC);
+		} else {
+			rbf->rbf_container = rbf->rbf_cached_container;
+			rbf->rbf_container->rbfc_rbf = rbf;
+			cmpxchg(&rbf->rbf_cached_container,
+			    rbf->rbf_cached_container, NULL);
+		}
+
+		if (unlikely(rbf->rbf_container == NULL)) {
+
+			/* Memory allocation failed, maintain a list */
+
+			head->func = (void *)func;
+			head->next = rbf->rbf_list_head;
+			rbf->rbf_list_head = head;
+			rbf->rbf_list_size++;
+			if (rbf->rbf_list_size == RCU_MAX_ACCUMULATE_SIZE)
+				__rcu_bulk_schedule_list(rbf);
+
+			goto done;
+		}
+
+		rbfc = rbf->rbf_container;
+		rbfc->rbfc_rbf = rbf;
+		rbfc->rbfc_entries = 0;
+
+		if (rbf->rbf_list_head != NULL)
+			__rcu_bulk_schedule_list(rbf);
+	}
+
+	offset = (unsigned long)func;
+	ptr = (void *)head - offset;
+
+	rbfc->rbfc_data[rbfc->rbfc_entries++] = ptr;
+	if (rbfc->rbfc_entries == RCU_MAX_ACCUMULATE_SIZE) {
+
+		WRITE_ONCE(rbf->rbf_container, NULL);
+		spin_unlock_bh(&rbf->rbf_lock);
+		call_rcu(&rbfc->rbfc_rcu, __rcu_bulk_free_impl);
+		return;
+	}
+
+done:
+	if (!rbf->rbf_monitor) {
+
+		call_rcu(&rbf->rbf_rcu, __rcu_bulk_free_monitor);
+		rbf->rbf_monitor = true;
+	}
+
+	spin_unlock_bh(&rbf->rbf_lock);
+}
+
+/*
+ * Queue an RCU callback for lazy invocation after a grace period.
+ * This will likely be later named something like "call_rcu_lazy()",
+ * but this change will require some way of tagging the lazy RCU
+ * callbacks in the list of pending callbacks. Until then, this
+ * function may only be called from __kfree_rcu().
+ */
+void kfree_call_rcu(struct rcu_head *head, rcu_callback_t func)
+{
+	__rcu_bulk_free(head, func, -1, 1);
+}
+EXPORT_SYMBOL_GPL(kfree_call_rcu);
+
+
 /* Tracepoints definitions. */
 EXPORT_TRACEPOINT_SYMBOL(kmalloc);
 EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
-- 
2.7.4
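P.S. For context, and not part of the patch itself: kfree_rcu() passes the
byte offset of the embedded rcu_head within the enclosing object as the
"func" argument (cast to rcu_callback_t), which is why __rcu_bulk_free()
above can recover the original allocation with ptr = (void *)head - offset
before handing it to kfree_bulk(). A minimal, hypothetical caller looks
like this (struct foo and release_foo() are made up for illustration):

struct foo {
	int data;
	struct rcu_head rcu;	/* embedded rcu_head */
};

static void release_foo(struct foo *f)
{
	/*
	 * Roughly expands to kfree_call_rcu(&f->rcu,
	 * offsetof(struct foo, rcu)); after a grace period the object
	 * is batched and freed through kfree_bulk().
	 */
	kfree_rcu(f, rcu);
}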