I have tested that I can reproduce with the most recent kvm-x86. [ 495.011678] ================================================================== [ 495.019933] BUG: KASAN: vmalloc-out-of-bounds in hugepage_has_attrs+0x1ad/0x1d0 [kvm] [ 495.028984] Read of size 4 at addr ffa000000057c008 by task private_mem_con/16295 [ 495.039536] CPU: 43 PID: 16295 Comm: private_mem_con Not tainted 6.8.0-rc4diagnostic+ #1 [ 495.049231] Hardware name: Oracle Corporation ORACLE SERVER X9-2c/TLA,MB TRAY,X9-2c, BIOS 66090600 08/23/2023 [ 495.061126] Call Trace: [ 495.064157] <TASK> [ 495.066600] dump_stack_lvl+0x47/0x60 [ 495.070789] print_report+0xcf/0x640 [ 495.074922] ? __pfx__raw_spin_lock_irqsave+0x10/0x10 [ 495.080886] ? __pfx_radix_tree_node_rcu_free+0x10/0x10 [ 495.086933] ? hugepage_has_attrs+0x1ad/0x1d0 [kvm] [ 495.092544] kasan_report+0xb0/0xe0 [ 495.096546] ? hugepage_has_attrs+0x1ad/0x1d0 [kvm] [ 495.102212] hugepage_has_attrs+0x1ad/0x1d0 [kvm] [ 495.107641] kvm_arch_post_set_memory_attributes+0x2b5/0xa80 [kvm] [ 495.115052] kvm_vm_ioctl+0x215a/0x3630 [kvm] [ 495.120236] ? kvm_emulate_hypercall+0x1b0/0xc60 [kvm] [ 495.126482] ? __pfx_kvm_vm_ioctl+0x10/0x10 [kvm] [ 495.132300] ? vmx_vmexit+0x72/0xd0 [kvm_intel] [ 495.137655] ? vmx_vmexit+0x9e/0xd0 [kvm_intel] [ 495.143080] ? vmx_vcpu_run+0xb52/0x1df0 [kvm_intel] [ 495.148809] ? user_return_notifier_register+0x23/0x120 [ 495.155168] ? vmx_handle_exit+0x5cb/0x1840 [kvm_intel] [ 495.161494] ? get_cpu_vendor+0x151/0x1c0 [ 495.166234] ? vcpu_run+0x1ad0/0x4fe0 [kvm] [ 495.171404] ? __pfx_vmx_vcpu_put+0x10/0x10 [kvm_intel] [ 495.177757] ? restore_fpregs_from_fpstate+0x91/0x150 [ 495.183807] ? __pfx_restore_fpregs_from_fpstate+0x10/0x10 [ 495.190241] ? kvm_arch_vcpu_put+0x50d/0x710 [kvm] [ 495.195810] ? mutex_unlock+0x7f/0xd0 [ 495.200063] ? __pfx_mutex_unlock+0x10/0x10 [ 495.205114] ? kfree+0xbc/0x270 [ 495.208824] ? __pfx_do_vfs_ioctl+0x10/0x10 [ 495.213685] ? __pfx_kvm_vcpu_ioctl+0x10/0x10 [kvm] [ 495.219681] ? 
rcu_core+0x3d0/0x1af0 [ 495.223801] ? __pfx_ioctl_has_perm.constprop.0.isra.0+0x10/0x10 [ 495.230794] ? __x64_sys_nanosleep_time32+0x62/0x240 [ 495.243211] ? __pfx_rcu_core+0x10/0x10 [ 495.253527] ? lapic_next_deadline+0x27/0x30 [ 495.264294] ? clockevents_program_event+0x1cd/0x290 [ 495.276271] ? security_file_ioctl+0x64/0xa0 [ 495.288009] __x64_sys_ioctl+0x12f/0x1a0 [ 495.298340] do_syscall_64+0x58/0x120 [ 495.309360] entry_SYSCALL_64_after_hwframe+0x6e/0x76 [ 495.321567] RIP: 0033:0x7f1a24c397cb [ 495.332094] Code: 73 01 c3 48 8b 0d bd 56 38 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa b8 10 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 8d 56 38 00 f7 d8 64 89 01 48 [ 495.364168] RSP: 002b:00007f1a241ffa08 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 [ 495.379294] RAX: ffffffffffffffda RBX: 0000000100000000 RCX: 00007f1a24c397cb [ 495.393694] RDX: 00007f1a241ffa50 RSI: 000000004020aed2 RDI: 0000000000000004 [ 495.408770] RBP: 0000000000ae68c0 R08: 000000000041b260 R09: 000000000000000c [ 495.423691] R10: 0000000000000001 R11: 0000000000000246 R12: 00007f1a25b85000 [ 495.437640] R13: 0000000000ae42a0 R14: 0000000000000000 R15: 0000000000000001 [ 495.452119] </TASK> [ 495.467231] The buggy address belongs to the virtual mapping at [ffa000000057c000, ffa000000057e000) created by: kvm_arch_prepare_memory_region+0x21c/0x770 [kvm] [ 495.508638] The buggy address belongs to the physical page: [ 495.520687] page:00000000fd0772bd refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x1fd557 [ 495.537219] memcg:ff11000371954f82 [ 495.547263] flags: 0x200000000000000(node=0|zone=2) [ 495.558820] page_type: 0xffffffff() [ 495.569018] raw: 0200000000000000 0000000000000000 dead000000000122 0000000000000000 [ 495.583833] raw: 0000000000000000 0000000000000000 00000001ffffffff ff11000371954f82 [ 495.598872] page dumped because: kasan: bad access detected [ 495.618236] Memory state around the buggy address: [ 495.630053] 
ffa000000057bf00: f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 [ 495.644646] ffa000000057bf80: f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 [ 495.659130] >ffa000000057c000: 00 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 [ 495.673448] ^ [ 495.683659] ffa000000057c080: f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 [ 495.700596] ffa000000057c100: f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 f8 [ 495.716978] ================================================================== I cannot reproduce with this bugfix patch. Even the 1st range at line 116 (0 - PAGE_SIZE) can reproduce the issue. 112 struct { 113 uint64_t offset; 114 uint64_t size; 115 } static const test_ranges[] = { 116 GUEST_STAGE(0, PAGE_SIZE), The memslot id=10 has: - base_gfn=1048576 - npages=1024 Therefore, "level - 1 will not contain an entry for each GFN at page size level". If aligned, we expect lpage_info[0] to have 512 elements. 1GB: lpage_info[1] has 1 element 2MB: lpage_info[0] has 2 elements Issue happens when guest_map_shared() the 1-page (1048576 to 1048577). So far the comments are conflicting. I agree "It is a little ambiguous whether the unaligned tail page should be expected to have KVM_LPAGE_MIXED_FLAG set." The below says KVM_LPAGE_MIXED_FLAG and lower bits are different (although functioning the same) ... /* * The most significant bit in disallow_lpage tracks whether or not memory * attributes are mixed, i.e. not identical for all gfns at the current level. * The lower order bits are used to refcount other cases where a hugepage is * disallowed, e.g. if KVM has shadow a page table at the gfn. */ #define KVM_LPAGE_MIXED_FLAG BIT(31) ... while the below implies they can be used the same. /* * Skip mixed tracking if the aligned gfn isn't covered * by the memslot, KVM can't use a hugepage due to the * misaligned address regardless of memory attributes. */ BTW, the number of entries in "struct kvm_arch_memory_slot" is not cached. 
This brings some obstacles when analyzing vmcore :) Dongli Zhang On 3/12/24 10:33, Rick Edgecombe wrote: > Fix KVM_SET_MEMORY_ATTRIBUTES to not overflow lpage_info array and trigger > KASAN splat, as seen in the private_mem_conversions_test selftest. > > When memory attributes are set on a GFN range, that range will have > specific properties applied to the TDP. A huge page cannot be used when > the attributes are inconsistent, so they are disabled for those > specific huge pages. For internal KVM reasons, huge pages are also not > allowed to span adjacent memslots regardless of whether the backing memory > could be mapped as huge. > > What GFNs support which huge page sizes is tracked by an array of arrays > 'lpage_info' on the memslot, of ‘kvm_lpage_info’ structs. Each index of > lpage_info contains a vmalloc allocated array of these for a specific > supported page size. The kvm_lpage_info denotes whether a specific huge > page (GFN and page size) on the memslot is supported. These arrays include > indices for unaligned head and tail huge pages. > > Preventing huge pages from spanning adjacent memslot is covered by > incrementing the count in head and tail kvm_lpage_info when the memslot is > allocated, but disallowing huge pages for memory that has mixed attributes > has to be done in a more complicated way. During the > KVM_SET_MEMORY_ATTRIBUTES ioctl KVM updates lpage_info for each memslot in > the range that has mismatched attributes. KVM does this a memslot at a > time, and marks a special bit, KVM_LPAGE_MIXED_FLAG, in the kvm_lpage_info > for any huge page. This bit is essentially a permanently elevated count. > So huge pages will not be mapped for the GFN at that page size if the > count is elevated in either case: a huge head or tail page unaligned to > the memslot or if KVM_LPAGE_MIXED_FLAG is set because it has mixed > attributes. 
> > To determine whether a huge page has consistent attributes, the > KVM_SET_MEMORY_ATTRIBUTES operation checks an xarray to make sure it > consistently has the incoming attribute. Since level - 1 huge pages are > aligned to level huge pages, it employs an optimization. As long as the > level - 1 huge pages are checked first, it can just check these and assume > that if each level - 1 huge page contained within the level sized huge > page is not mixed, then the level size huge page is not mixed. This > optimization happens in the helper hugepage_has_attrs(). > > Unfortunately, although the kvm_lpage_info array representing page size > 'level' will contain an entry for an unaligned tail page of size level, > the array for level - 1 will not contain an entry for each GFN at page > size level. The level - 1 array will only contain an index for any > unaligned region covered by level - 1 huge page size, which can be a > smaller region. So this causes the optimization to overflow the level - 1 > kvm_lpage_info and perform a vmalloc out of bounds read. > > In some cases of head and tail pages where an overflow could happen, > callers skip the operation completely as KVM_LPAGE_MIXED_FLAG is not > required to prevent huge pages as discussed earlier. But for memslots that > are smaller than the 1GB page size, it does call hugepage_has_attrs(). The > issue can be observed simply by compiling the kernel with > CONFIG_KASAN_VMALLOC and running the selftest > “private_mem_conversions_test”, which produces the output like the > following: > > BUG: KASAN: vmalloc-out-of-bounds in hugepage_has_attrs+0x7e/0x110 > Read of size 4 at addr ffffc900000a3008 by task private_mem_con/169 > Call Trace: > dump_stack_lvl > print_report > ? __virt_addr_valid > ? hugepage_has_attrs > ? hugepage_has_attrs > kasan_report > ? 
hugepage_has_attrs > hugepage_has_attrs > kvm_arch_post_set_memory_attributes > kvm_vm_ioctl > > It is a little ambiguous whether the unaligned tail page should be > expected to have KVM_LPAGE_MIXED_FLAG set. It is not functionally > required, as the unaligned tail pages will already have their > kvm_lpage_info count incremented. The comments imply not setting it on > unaligned head pages is intentional, so fix the callers to skip trying to > set KVM_LPAGE_MIXED_FLAG in this case, and in doing so not call > hugepage_has_attrs(). > > Also rename hugepage_has_attrs() to __slot_hugepage_has_attrs() because it > is a delicate function that should not be widely used, and only is valid > for ranges covered by the passed slot. > > Cc: stable@xxxxxxxxxxxxxxx > Fixes: 90b4fe17981e ("KVM: x86: Disallow hugepages when memory attributes are mixed") > Signed-off-by: Rick Edgecombe <rick.p.edgecombe@xxxxxxxxx> > --- > Hi, > > I added cc stable because I didn't rule out a way to trigger a non-kasan > crash from userspace on non-x86. But of course this is a testing only > feature at this point and shouldn't cause a crash for normal users. > > Testing was just the upstream selftests and a TDX guest boot on out of tree > branch. 
> > Rick > --- > arch/x86/kvm/mmu/mmu.c | 13 +++++++------ > 1 file changed, 7 insertions(+), 6 deletions(-) > > diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c > index 0544700ca50b..4dac778b2520 100644 > --- a/arch/x86/kvm/mmu/mmu.c > +++ b/arch/x86/kvm/mmu/mmu.c > @@ -7337,8 +7337,8 @@ static void hugepage_set_mixed(struct kvm_memory_slot *slot, gfn_t gfn, > lpage_info_slot(gfn, slot, level)->disallow_lpage |= KVM_LPAGE_MIXED_FLAG; > } > > -static bool hugepage_has_attrs(struct kvm *kvm, struct kvm_memory_slot *slot, > - gfn_t gfn, int level, unsigned long attrs) > +static bool __slot_hugepage_has_attrs(struct kvm *kvm, struct kvm_memory_slot *slot, > + gfn_t gfn, int level, unsigned long attrs) > { > const unsigned long start = gfn; > const unsigned long end = start + KVM_PAGES_PER_HPAGE(level); > @@ -7388,8 +7388,9 @@ bool kvm_arch_post_set_memory_attributes(struct kvm *kvm, > * by the memslot, KVM can't use a hugepage due to the > * misaligned address regardless of memory attributes. 
> */ > - if (gfn >= slot->base_gfn) { > - if (hugepage_has_attrs(kvm, slot, gfn, level, attrs)) > + if (gfn >= slot->base_gfn && > + gfn + nr_pages <= slot->base_gfn + slot->npages) { > + if (__slot_hugepage_has_attrs(kvm, slot, gfn, level, attrs)) > hugepage_clear_mixed(slot, gfn, level); > else > hugepage_set_mixed(slot, gfn, level); > @@ -7411,7 +7412,7 @@ bool kvm_arch_post_set_memory_attributes(struct kvm *kvm, > */ > if (gfn < range->end && > (gfn + nr_pages) <= (slot->base_gfn + slot->npages)) { > - if (hugepage_has_attrs(kvm, slot, gfn, level, attrs)) > + if (__slot_hugepage_has_attrs(kvm, slot, gfn, level, attrs)) > hugepage_clear_mixed(slot, gfn, level); > else > hugepage_set_mixed(slot, gfn, level); > @@ -7449,7 +7450,7 @@ void kvm_mmu_init_memslot_memory_attributes(struct kvm *kvm, > for (gfn = start; gfn < end; gfn += nr_pages) { > unsigned long attrs = kvm_get_memory_attributes(kvm, gfn); > > - if (hugepage_has_attrs(kvm, slot, gfn, level, attrs)) > + if (__slot_hugepage_has_attrs(kvm, slot, gfn, level, attrs)) > hugepage_clear_mixed(slot, gfn, level); > else > hugepage_set_mixed(slot, gfn, level); > > base-commit: 5abf6dceb066f2b02b225fd561440c98a8062681