On Sat, 2021-07-03 at 09:24 +0200, Mike Galbraith wrote:
>
> It also appears to be saying that there's something RT specific to
> stare at in addition to the list_lock business.

The what is ___slab_alloc() consuming 3.9% CPU in tip-rt-slub whereas it consumes < 1% in both tip-rt (sans slub patches) and tip-slub.

The why remains to ponder.

        5.13.0.g60ab3ed-tip-rt          5.13.0.g60ab3ed-tip-rt-slub     5.13.0.g60ab3ed-tip-slub
25.18%  copy_user_enhanced_fast_string  copy_user_enhanced_fast_string  copy_user_enhanced_fast_string
 5.08%  unix_stream_read_generic        unix_stream_read_generic        unix_stream_read_generic
 3.39%  rt_spin_lock                    *** ___slab_alloc ***           __skb_datagram_iter
 2.80%  __skb_datagram_iter             rt_spin_lock                    _raw_spin_lock
 2.11%  get_page_from_freelist          __skb_datagram_iter             __alloc_skb
 2.01%  skb_release_data                rt_spin_unlock                  skb_release_data
 1.94%  rt_spin_unlock                  get_page_from_freelist          __alloc_pages
 1.85%  __alloc_skb                     migrate_enable                  unix_stream_sendmsg
 1.68%  __schedule                      skb_release_data                _raw_spin_lock_irqsave
 1.67%  unix_stream_sendmsg             __schedule                      free_pcppages_bulk
 1.50%  free_pcppages_bulk              unix_stream_sendmsg             __slab_free
 1.38%  migrate_enable                  free_pcppages_bulk              __fget_light
 1.24%  __fget_light                    __alloc_pages                   vfs_write
 1.16%  __slab_free                     migrate_disable                 __schedule
 1.14%  __alloc_pages                   __fget_light                    get_page_from_freelist
 1.10%  fsnotify                        __slab_free                     new_sync_write
 1.07%  kfree                           fsnotify                        fsnotify

5.13.0.g60ab3ed-tip-rt-slub ___slab_alloc() consumes 3.90%
  0.40 │       mov    0x28(%r13),%edx
  0.42 │       add    %r15,%rdx
       │     __swab():
       │     #endif
       │
       │     static __always_inline unsigned long __swab(const unsigned long y)
       │     {
       │     #if __BITS_PER_LONG == 64
       │       return __swab64(y);
  0.05 │       mov    %rdx,%rax
  1.14 │       bswap  %rax
       │     freelist_ptr():
       │       return (void *)((unsigned long)ptr ^ s->random ^   <== CONFIG_SLAB_FREELIST_HARDENED
  0.72 │       xor    0xb0(%r13),%rax
 65.41 │       xor    (%rdx),%rax   <== huh? miss = 65% of that 3.9% kernel util?
       │     next_tid():
       │       return tid + TID_STEP;
  0.09 │       addq   $0x200,0x48(%r12)
       │     ___slab_alloc():
       │        * freelist is pointing to the list of objects to be used.
       │        * page is pointing to the page from which the objects are obtained.
       │        * That page must be frozen for per cpu allocations to work.
       │        */
       │       VM_BUG_ON(!c->page->frozen);
       │       c->freelist = get_freepointer(s, freelist);
  0.05 │       mov    %rax,0x40(%r12)
       │       c->tid = next_tid(c->tid);
       │       local_unlock_irqrestore(&s->cpu_slab->lock, flags);

5.13.0.g60ab3ed-tip-rt ___slab_alloc() consumes < 1%
Percent│       }
       │
       │       /* must check again c->freelist in case of cpu migration or IRQ */
       │       freelist = c->freelist;
  0.02 │ a1:   mov    (%r14),%r13
       │       if (freelist)
       │       test   %r13,%r13
  0.02 │     ↓ je     460
       │     get_freepointer():
       │       return freelist_dereference(s, object + s->offset);
  0.23 │ ad:   mov    0x28(%r12),%edx
  0.18 │       add    %r13,%rdx
       │     __swab():
       │     #endif
       │
       │     static __always_inline unsigned long __swab(const unsigned long y)
       │     {
       │     #if __BITS_PER_LONG == 64
       │       return __swab64(y);
  0.06 │       mov    %rdx,%rax
  1.16 │       bswap  %rax
       │     freelist_ptr():
       │       return (void *)((unsigned long)ptr ^ s->random ^
  0.23 │       xor    0xb0(%r12),%rax
 35.25 │       xor    (%rdx),%rax   <== 35% of < 1% kernel util
       │     next_tid():
       │       return tid + TID_STEP;
  0.28 │       addq   $0x200,0x8(%r14)
       │     ___slab_alloc():
       │        * freelist is pointing to the list of objects to be used.
       │        * page is pointing to the page from which the objects are obtained.
       │        * That page must be frozen for per cpu allocations to work.
       │        */
       │       VM_BUG_ON(!c->page->frozen);
       │       c->freelist = get_freepointer(s, freelist);

5.13.0.g60ab3ed-tip-slub ___slab_alloc() also consumes < 1%
Percent│     load_freelist:
       │
       │       lockdep_assert_held(this_cpu_ptr(&s->cpu_slab->lock));
  0.28 │ 84:   add    this_cpu_off,%rax
       │     get_freepointer():
       │       return freelist_dereference(s, object + s->offset);
  0.14 │       mov    0x28(%r14),%eax
       │     ___slab_alloc():
       │        * freelist is pointing to the list of objects to be used.
       │        * page is pointing to the page from which the objects are obtained.
       │        * That page must be frozen for per cpu allocations to work.
       │        */
       │       VM_BUG_ON(!c->page->frozen);
       │       c->freelist = get_freepointer(s, freelist);
 34.36 │       mov    0x0(%r13,%rax,1),%rax
       │     next_tid():
       │       return tid + TID_STEP;
  0.10 │       addq   $0x1,0x8(%r12)
       │     ___slab_alloc():
       │       c->freelist = get_freepointer(s, freelist);
  0.04 │       mov    %rax,(%r12)
       │       c->tid = next_tid(c->tid);
       │       local_unlock_irqrestore(&s->cpu_slab->lock, flags);
  0.12 │       mov    (%r14),%rax
  0.03 │       add    this_cpu_off,%rax
       │     arch_local_irq_restore():
       │       return arch_irqs_disabled_flags(flags);
       │     }