On Wed, Nov 29, 2017 at 11:38:25AM -0500, Serhii Popovych wrote: > When serving multiple resize requests following could happen: > > CPU0 CPU1 > ---- ---- > kvm_vm_ioctl_resize_hpt_prepare(1); > -> schedule_work() > /* system_rq might be busy: delay */ > kvm_vm_ioctl_resize_hpt_prepare(2); > mutex_lock(); > if (resize) { > ... > release_hpt_resize(); > } > ... resize_hpt_prepare_work() > -> schedule_work() { > mutex_unlock() /* resize->kvm could be wrong */ > struct kvm *kvm = resize->kvm; > > mutex_lock(&kvm->lock); <<<< UAF > ... > } > > Since scheduled work could be delayed (e.g. when system is busy) we > another resize request with different order could be promoted by > kvm_vm_ioctl_resize_hpt_prepare() and previous request could be > freed right before resize_hpt_prepare_work() begins execution or > right under mutex_lock() when it is executed in parallel with ioctl. > > In both cases we get use after free in point marked with UAF on the > diagram above. > > To prevent this from happening we must not release previous request > immediately in ioctl instead postponing this to resize_hpt_prepare_work(). > > Make resize_hpt_release() generic: we use it in more cases. > > This fixes following or similar host kernel crash message: > > [ 635.277361] Unable to handle kernel paging request for data at address 0x00000000 > [ 635.277438] Faulting instruction address: 0xc00000000052f568 > [ 635.277446] Oops: Kernel access of bad area, sig: 11 [#1] > [ 635.277451] SMP NR_CPUS=2048 NUMA PowerNV > [ 635.277470] Modules linked in: xt_CHECKSUM iptable_mangle ipt_MASQUERADE > nf_nat_masquerade_ipv4 iptable_nat nf_nat_ipv4 nf_nat nf_conntrack_ipv4 > nf_defrag_ipv4 xt_conntrack nf_conntrack ipt_REJECT nf_reject_ipv4 tun bridge stp llc > ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter nfsv3 nfs_acl nfs > lockd grace fscache kvm_hv kvm rpcrdma sunrpc ib_isert iscsi_target_mod ib_iser libiscsi > scsi_transport_iscsi ib_srpt target_core_mod ext4 ib_srp scsi_transport_srp > ib_ipoib mbcache jbd2 rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm ocrdma(T) > ib_core ses enclosure scsi_transport_sas sg shpchp leds_powernv ibmpowernv i2c_opal > i2c_core powernv_rng ipmi_powernv ipmi_devintf ipmi_msghandler ip_tables xfs > libcrc32c sr_mod sd_mod cdrom lpfc nvme_fc(T) nvme_fabrics nvme_core ipr nvmet_fc(T) > tg3 nvmet libata be2net crc_t10dif crct10dif_generic scsi_transport_fc ptp scsi_tgt > pps_core crct10dif_common dm_mirror dm_region_hash dm_log dm_mod > [ 635.278687] CPU: 40 PID: 749 Comm: kworker/40:1 Tainted: G > ------------ T 3.10.0.bz1510771+ #1 > [ 635.278782] Workqueue: events resize_hpt_prepare_work [kvm_hv] > [ 635.278851] task: c0000007e6840000 ti: c0000007e9180000 task.ti: c0000007e9180000 > [ 635.278919] NIP: c00000000052f568 LR: c0000000009ea310 CTR: c0000000009ea4f0 > [ 635.278988] REGS: c0000007e91837f0 TRAP: 0300 Tainted: G > ------------ T (3.10.0.bz1510771+) > [ 635.279077] MSR: 9000000100009033 <SF,HV,EE,ME,IR,DR,RI,LE> CR: 24002022 XER: > 00000000 > [ 635.279248] CFAR: c000000000009368 DAR: 0000000000000000 DSISR: 40000000 SOFTE: 1 > GPR00: c0000000009ea310 c0000007e9183a70 c000000001250b00 c0000007e9183b10 > GPR04: 0000000000000000 0000000000000000 c0000007e9183650 0000000000000000 > GPR08: c0000007ffff7b80 00000000ffffffff 0000000080000028 d00000000d2529a0 > GPR12: 0000000000002200 c000000007b56800 c000000000120028 c0000007f135bb40 > GPR16: 0000000000000000 c000000005c1e018 c000000005c1e018 0000000000000000 > GPR20: 0000000000000001 c0000000011bf778 0000000000000001 fffffffffffffef7 > GPR24: 0000000000000000 c000000f1e262e50 0000000000000002 c0000007e9180000 > GPR28: c000000f1e262e4c c000000f1e262e50 0000000000000000 c0000007e9183b10 > [ 635.280149] NIP [c00000000052f568] __list_add+0x38/0x110 > [ 635.280197] LR [c0000000009ea310] __mutex_lock_slowpath+0xe0/0x2c0 > [ 635.280253] Call Trace: > [ 635.280277] [c0000007e9183af0] [c0000000009ea310] __mutex_lock_slowpath+0xe0/0x2c0 > [ 635.280356] [c0000007e9183b70] [c0000000009ea554] mutex_lock+0x64/0x70 > [ 635.280426] [c0000007e9183ba0] [d00000000d24da04] > resize_hpt_prepare_work+0xe4/0x1c0 [kvm_hv] > [ 635.280507] [c0000007e9183c40] [c000000000113c0c] process_one_work+0x1dc/0x680 > [ 635.280587] [c0000007e9183ce0] [c000000000114250] worker_thread+0x1a0/0x520 > [ 635.280655] [c0000007e9183d80] [c00000000012010c] kthread+0xec/0x100 > [ 635.280724] [c0000007e9183e30] [c00000000000a4b8] ret_from_kernel_thread+0x5c/0xa4 > [ 635.280814] Instruction dump: > [ 635.280880] 7c0802a6 fba1ffe8 fbc1fff0 7cbd2b78 fbe1fff8 7c9e2378 7c7f1b78 > f8010010 > [ 635.281099] f821ff81 e8a50008 7fa52040 40de00b8 <e8be0000> 7fbd2840 40de008c > 7fbff040 > [ 635.281324] ---[ end trace b628b73449719b9d ]--- > > Signed-off-by: Serhii Popovych <spopovyc@xxxxxxxxxx> Reviewed-by: David Gibson <david@xxxxxxxxxxxxxxxxxxxxx> > --- > arch/powerpc/kvm/book3s_64_mmu_hv.c | 45 ++++++++++++++++++++++++++----------- > 1 file changed, 32 insertions(+), 13 deletions(-) > > diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c > index 3e9abd9..690f061 100644 > --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c > +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c > @@ -1417,15 +1417,18 @@ static void resize_hpt_pivot(struct kvm_resize_hpt *resize) > > static void resize_hpt_release(struct kvm *kvm, struct kvm_resize_hpt *resize) > { > - BUG_ON(kvm->arch.resize_hpt != resize); > + BUG_ON(!mutex_is_locked(&kvm->lock)); > > if (!resize) > return; > > - kvmppc_free_hpt(&resize->hpt); > + if (resize->error != -EBUSY) { > + kvmppc_free_hpt(&resize->hpt); > + kfree(resize); > + } > > - kvm->arch.resize_hpt = NULL; > - kfree(resize); > + if (kvm->arch.resize_hpt == resize) > + kvm->arch.resize_hpt = NULL; > } > > static void resize_hpt_prepare_work(struct work_struct *work) > @@ -1434,24 +1437,40 @@ static void resize_hpt_prepare_work(struct work_struct *work) > struct kvm_resize_hpt, > work); > struct kvm *kvm = resize->kvm; > - int err; > + int err = 0; > > BUG_ON(resize->error != -EBUSY); > > - resize_hpt_debug(resize, "resize_hpt_prepare_work(): order = %d\n", > - resize->order); > + mutex_lock(&kvm->lock); > + > + /* Request is still current? */ > + if (kvm->arch.resize_hpt == resize) { > + /* We may request large allocations here: > + * do not sleep with kvm->lock held for a while. > + */ > + mutex_unlock(&kvm->lock); > > - err = resize_hpt_allocate(resize); > + resize_hpt_debug(resize, "resize_hpt_prepare_work(): order = %d\n", > + resize->order); > > - /* We have strict assumption about -EBUSY > - * when preparing for HPT resize. > - */ > - BUG_ON(err == -EBUSY); > + err = resize_hpt_allocate(resize); > > - mutex_lock(&kvm->lock); > + /* We have strict assumption about -EBUSY > + * when preparing for HPT resize. > + */ > + BUG_ON(err == -EBUSY); > + > + mutex_lock(&kvm->lock); > + /* It is possible that kvm->arch.resize_hpt != resize > + * after we grab kvm->lock again. > + */ > + } > > resize->error = err; > > + if (kvm->arch.resize_hpt != resize) > + resize_hpt_release(kvm, resize); > + > mutex_unlock(&kvm->lock); > } > -- David Gibson | I'll have my music baroque, and my code david AT gibson.dropbear.id.au | minimalist, thank you. NOT _the_ _other_ | _way_ _around_! http://www.ozlabs.org/~dgibson
Attachment:
signature.asc
Description: PGP signature