Relayout members of struct kvm_vcpu and embedded structs to reduce its memory footprint. Not that it makes sense from a memory usage point of view (given how few of such objects get allocated), but this series achieves to make it consume two cachelines less, which should provide a micro-architectural net win. However, I wasn't able to see a noticeable difference running benchmarks within a guest VM -- the VMEXIT costs are likely still high enough to mask any gains. Pointers to other benchmark tools / tests are highly appreciated! I made sure that the mutex and spinlocks still stick to their own disjoint cachelines, so this series shouldn't cause any regression in that regard. Below is the high level pahole(1) diff. Most significant is the overall size change from 6688 to 6560 bytes, i.e. -128 bytes. --- kvm_vcpu.before 2023-02-13 14:13:49.919952154 +0100 +++ kvm_vcpu.after 2023-02-13 14:13:53.559952140 +0100 @@ -6,78 +6,60 @@ int vcpu_idx; /* 40 4 */ int ____srcu_idx; /* 44 4 */ int mode; /* 48 4 */ - - /* XXX 4 bytes hole, try to pack */ - + unsigned int guest_debug; /* 52 4 */ u64 requests; /* 56 8 */ /* --- cacheline 1 boundary (64 bytes) --- */ - long unsigned int guest_debug; /* 64 8 */ - struct mutex mutex; /* 72 32 */ - struct kvm_run * run; /* 104 8 */ - struct rcuwait wait; /* 112 8 */ - struct pid * pid; /* 120 8 */ + struct mutex mutex; /* 64 32 */ + struct kvm_run * run; /* 96 8 */ + struct rcuwait wait; /* 104 8 */ + struct pid * pid; /* 112 8 */ + sigset_t sigset; /* 120 8 */ /* --- cacheline 2 boundary (128 bytes) --- */ int sigset_active; /* 128 4 */ - - /* XXX 4 bytes hole, try to pack */ - - sigset_t sigset; /* 136 8 */ - unsigned int halt_poll_ns; /* 144 4 */ - bool valid_wakeup; /* 148 1 */ + unsigned int halt_poll_ns; /* 132 4 */ + bool valid_wakeup; /* 136 1 */ /* XXX 3 bytes hole, try to pack */ - int mmio_needed; /* 152 4 */ - int mmio_read_completed; /* 156 4 */ - int mmio_is_write; /* 160 4 */ - int mmio_cur_fragment; /* 164 4 */ - int mmio_nr_fragments; /* 168 4 */ - - /* XXX 4 bytes hole, try to pack */ - - struct kvm_mmio_fragment mmio_fragments[2]; /* 176 48 */ - /* --- cacheline 3 boundary (192 bytes) was 32 bytes ago --- */ + int mmio_needed; /* 140 4 */ + int mmio_read_completed; /* 144 4 */ + int mmio_is_write; /* 148 4 */ + int mmio_cur_fragment; /* 152 4 */ + int mmio_nr_fragments; /* 156 4 */ + struct kvm_mmio_fragment mmio_fragments[2]; /* 160 48 */ + /* --- cacheline 3 boundary (192 bytes) was 16 bytes ago --- */ struct { - u32 queued; /* 224 4 */ - - /* XXX 4 bytes hole, try to pack */ - - struct list_head queue; /* 232 16 */ - struct list_head done; /* 248 16 */ - /* --- cacheline 4 boundary (256 bytes) was 8 bytes ago --- */ - spinlock_t lock; /* 264 4 */ - } async_pf; /* 224 48 */ - - /* XXX last struct has 4 bytes of padding */ - + struct list_head queue; /* 208 16 */ + struct list_head done; /* 224 16 */ + spinlock_t lock; /* 240 4 */ + u32 queued; /* 244 4 */ + } async_pf; /* 208 40 */ struct { - bool in_spin_loop; /* 272 1 */ - bool dy_eligible; /* 273 1 */ - } spin_loop; /* 272 2 */ - bool preempted; /* 274 1 */ - bool ready; /* 275 1 */ + bool in_spin_loop; /* 248 1 */ + bool dy_eligible; /* 249 1 */ + } spin_loop; /* 248 2 */ + bool preempted; /* 250 1 */ + bool ready; /* 251 1 */ /* XXX 4 bytes hole, try to pack */ - struct kvm_vcpu_arch arch __attribute__((__aligned__(8))); /* 280 5208 */ - - /* XXX last struct has 6 bytes of padding */ - - /* --- cacheline 85 boundary (5440 bytes) was 48 bytes ago --- */ - struct kvm_vcpu_stat stat; /* 5488 1104 */ - /* --- cacheline 103 boundary (6592 bytes) --- */ - char stats_id[48]; /* 6592 48 */ - struct kvm_dirty_ring dirty_ring; /* 6640 32 */ + /* --- cacheline 4 boundary (256 bytes) --- */ + struct kvm_vcpu_arch arch __attribute__((__aligned__(8))); /* 256 5104 */ + /* --- cacheline 83 boundary (5312 bytes) was 48 bytes ago --- */ + struct kvm_vcpu_stat stat; /* 5360 1104 */ + /* --- cacheline 101 boundary (6464 bytes) --- */ + char stats_id[48]; /* 6464 48 */ + struct kvm_dirty_ring dirty_ring; /* 6512 32 */ /* XXX last struct has 4 bytes of padding */ - /* --- cacheline 104 boundary (6656 bytes) was 16 bytes ago --- */ - struct kvm_memory_slot * last_used_slot; /* 6672 8 */ - u64 last_used_slot_gen; /* 6680 8 */ - - /* size: 6688, cachelines: 105, members: 33 */ - /* sum members: 6669, holes: 5, sum holes: 19 */ - /* paddings: 3, sum paddings: 14 */ + /* --- cacheline 102 boundary (6528 bytes) was 16 bytes ago --- */ + struct kvm_memory_slot * last_used_slot; /* 6544 8 */ + u64 last_used_slot_gen; /* 6552 8 */ + + /* size: 6560, cachelines: 103, members: 33 */ + /* sum members: 6553, holes: 2, sum holes: 7 */ + /* paddings: 1, sum paddings: 4 */ /* forced alignments: 1, forced holes: 1, sum forced holes: 4 */ /* last cacheline: 32 bytes */ } __attribute__((__aligned__(8))); There are still holes left, i.e. still room for squeezing out a few more bytes. But I didn't want to destroy the grouping of members nor sprinkle random structs with __packed just for that. That said, we might still want to do it to save yet another cacheline (only 32 more bytes to go). Code wise it seems to save a few instructions as well, likely because of patch 2, simplifying some of the struct kvm_queued_exception users: $ size kvm-6.2-rc5{,+patch}/arch/x86/kvm/kvm.ko text data bss dec hex filename 806277 129095 1168 936540 e4a5c kvm-6.2-rc5/arch/x86/kvm/kvm.ko 806026 129063 1168 936257 e4941 kvm-6.2-rc5+patch/arch/x86/kvm/kvm.ko $ bloat-o-meter kvm-6.2-rc5{,+patch}/arch/x86/kvm/kvm.ko add/remove: 0/2 grow/shrink: 7/11 up/down: 102/-340 (-238) Function old new delta kvm_vcpu_ioctl_x86_set_vcpu_events 664 707 +43 kvm_multiple_exception 581 613 +32 kvm_inject_exception 153 161 +8 kvm_vcpu_ioctl_x86_get_vcpu_events 510 517 +7 __UNIQUE_ID_vermagic166 67 73 +6 kvm_inject_page_fault 108 113 +5 kvm_mmu_x86_module_init 174 175 +1 kvm_vcpu_ready_for_interrupt_injection 134 131 -3 kvm_vcpu_ioctl 1717 1714 -3 kvm_vcpu_compat_ioctl 274 271 -3 kvm_sigset_activate 53 50 -3 kvm_can_do_async_pf 150 147 -3 kvm_mtrr_get_msr 290 286 -4 kvm_deliver_exception_payload 139 135 -4 kvm_arch_vcpu_ioctl_set_guest_debug 662 657 -5 x86_emulate_instruction 1952 1945 -7 kvm_vcpu_reset 1382 1375 -7 __pfx_kvm_deliver_exception_payload.part 16 - -16 kvm_deliver_exception_payload.part 122 - -122 kvm_arch_vcpu_ioctl_run 7782 7622 -160 Total: Before=635427, After=635189, chg -0.04% Please apply! Thanks, Mathias Krause (5): KVM: x86: Shrink struct kvm_pmu KVM: x86: Shrink struct kvm_queued_exception KVM: Shrink struct kvm_mmu_memory_cache KVM: x86: Shrink struct kvm_vcpu_arch KVM: Shrink struct kvm_vcpu arch/x86/include/asm/kvm_host.h | 30 +++++++++++++++--------------- include/linux/kvm_host.h | 6 +++--- include/linux/kvm_types.h | 2 +- 3 files changed, 19 insertions(+), 19 deletions(-) -- 2.39.1