On Tue 31-10-17 14:50:58, Waiman Long wrote: > The dlock list needs one list for each of the CPUs available. However, > for sibling CPUs, they are sharing the L2 and probably L1 caches > too. As a result, there is not much to gain in term of avoiding > cacheline contention while increasing the cacheline footprint of the > L1/L2 caches as separate lists may need to be in the cache. > > This patch makes all the sibling CPUs share the same list, thus > reducing the number of lists that need to be maintained in each > dlock list without having any noticeable impact on performance. It > also improves dlock list iteration performance as fewer lists need > to be iterated. > > Signed-off-by: Waiman Long <longman@xxxxxxxxxx> The patch looks good to me. You can add: Reviewed-by: Jan Kara <jack@xxxxxxx> Honza > --- > lib/dlock-list.c | 74 ++++++++++++++++++++++++++++++++++++++++++++------------ > 1 file changed, 59 insertions(+), 15 deletions(-) > > diff --git a/lib/dlock-list.c b/lib/dlock-list.c > index 17ced06..a4ddecc 100644 > --- a/lib/dlock-list.c > +++ b/lib/dlock-list.c > @@ -25,31 +25,65 @@ > * The distributed and locked list is a distributed set of lists each of > * which is protected by its own spinlock, but acts like a single > * consolidated list to the callers. For scaling purpose, the number of > - * lists used is equal to the number of possible CPUs in the system to > - * minimize contention. > + * lists used is equal to the number of possible cores in the system to > + * minimize contention. All threads of the same CPU core will share the > + * same list. > * > - * However, it is possible that individual CPU numbers may be equal to > - * or greater than the number of possible CPUs when there are holes in > - * the CPU number list. As a result, we need to map the CPU number to a > - * list index. > + * We need to map each CPU number to a list index. > */ > static DEFINE_PER_CPU_READ_MOSTLY(int, cpu2idx); > +static int nr_dlock_lists __read_mostly; > > /* > - * Initialize cpu2idx mapping table > + * Initialize cpu2idx mapping table & nr_dlock_lists. > * > * It is possible that a dlock-list can be allocated before the cpu2idx is > * initialized. In this case, all the cpus are mapped to the first entry > * before initialization. > * > + * All the sibling CPUs of a sibling group will map to the same dlock list so > + * as to reduce the number of dlock lists to be maintained while minimizing > + * cacheline contention. > + * > + * As the sibling masks are set up in the core initcall phase, this function > + * has to be done in the postcore phase to get the right data. > */ > static int __init cpu2idx_init(void) > { > int idx, cpu; > + struct cpumask *sibling_mask; > + static struct cpumask mask __initdata; > > + cpumask_clear(&mask); > idx = 0; > - for_each_possible_cpu(cpu) > - per_cpu(cpu2idx, cpu) = idx++; > + for_each_possible_cpu(cpu) { > + int scpu; > + > + if (cpumask_test_cpu(cpu, &mask)) > + continue; > + per_cpu(cpu2idx, cpu) = idx; > + cpumask_set_cpu(cpu, &mask); > + > + sibling_mask = topology_sibling_cpumask(cpu); > + if (sibling_mask) { > + for_each_cpu(scpu, sibling_mask) { > + per_cpu(cpu2idx, scpu) = idx; > + cpumask_set_cpu(scpu, &mask); > + } > + } > + idx++; > + } > + > + /* > + * nr_dlock_lists can only be set after cpu2idx is properly > + * initialized. > + */ > + smp_mb(); > + nr_dlock_lists = idx; > + WARN_ON(nr_dlock_lists > nr_cpu_ids); > + > + pr_info("dlock-list: %d head entries per dlock list.\n", > + nr_dlock_lists); > return 0; > } > postcore_initcall(cpu2idx_init); > @@ -67,19 +101,23 @@ static int __init cpu2idx_init(void) > * > * Dynamically allocated locks need to have their own special lock class > * to avoid lockdep warning. > + * > + * Since nr_dlock_lists will always be <= nr_cpu_ids, having more lists > + * than necessary allocated is not a problem other than some wasted memory. > + * The extra lists will not be ever used as all the cpu2idx entries will be > + * 0 before initialization. > */ > int __alloc_dlock_list_heads(struct dlock_list_heads *dlist, > struct lock_class_key *key) > { > - int idx; > + int idx, cnt = nr_dlock_lists ? nr_dlock_lists : nr_cpu_ids; > > - dlist->heads = kcalloc(nr_cpu_ids, sizeof(struct dlock_list_head), > - GFP_KERNEL); > + dlist->heads = kcalloc(cnt, sizeof(struct dlock_list_head), GFP_KERNEL); > > if (!dlist->heads) > return -ENOMEM; > > - for (idx = 0; idx < nr_cpu_ids; idx++) { > + for (idx = 0; idx < cnt; idx++) { > struct dlock_list_head *head = &dlist->heads[idx]; > > INIT_LIST_HEAD(&head->list); > @@ -117,7 +155,10 @@ bool dlock_lists_empty(struct dlock_list_heads *dlist) > { > int idx; > > - for (idx = 0; idx < nr_cpu_ids; idx++) > + /* Shouldn't be called before nr_dlock_lists is initialized */ > + WARN_ON_ONCE(!nr_dlock_lists); > + > + for (idx = 0; idx < nr_dlock_lists; idx++) > if (!list_empty(&dlist->heads[idx].list)) > return false; > return true; > @@ -199,6 +240,9 @@ struct dlock_list_node *__dlock_list_next_list(struct dlock_list_iter *iter) > struct dlock_list_node *next; > struct dlock_list_head *head; > > + /* Shouldn't be called before nr_dlock_lists is initialized */ > + WARN_ON_ONCE(!nr_dlock_lists); > + > restart: > if (iter->entry) { > spin_unlock(&iter->entry->lock); > @@ -209,7 +253,7 @@ struct dlock_list_node *__dlock_list_next_list(struct dlock_list_iter *iter) > /* > * Try next list > */ > - if (++iter->index >= nr_cpu_ids) > + if (++iter->index >= nr_dlock_lists) > return NULL; /* All the entries iterated */ > > if (list_empty(&iter->head[iter->index].list)) > -- > 1.8.3.1 > > -- Jan Kara <jack@xxxxxxxx> SUSE Labs, CR