Re: [PATCH 3/4] KVM: PPC: Book3S HV: Fix use after free in case of multiple resize requests

David Gibson <david@xxxxxxxxxxxxxxxxxxxxx> · Thu, 30 Nov 2017 14:51:12 +1100



On Wed, Nov 29, 2017 at 11:38:25AM -0500, Serhii Popovych wrote:
> When serving multiple resize requests following could happen:
> 
>     CPU0                                    CPU1
>     ----                                    ----
>     kvm_vm_ioctl_resize_hpt_prepare(1);
>       -> schedule_work()
>                                             /* system_rq might be busy: delay */
>     kvm_vm_ioctl_resize_hpt_prepare(2);
>       mutex_lock();
>       if (resize) {
>          ...
>          release_hpt_resize();
>       }
>       ...                                   resize_hpt_prepare_work()
>       -> schedule_work()                    {
>       mutex_unlock()                           /* resize->kvm could be wrong */
>                                                struct kvm *kvm = resize->kvm;
> 
>                                                mutex_lock(&kvm->lock);   <<<< UAF
>                                                ...
>                                             }
> 
> Since scheduled work could be delayed (e.g. when system is busy) we
> another resize request with different order could be promoted by
> kvm_vm_ioctl_resize_hpt_prepare() and previous request could be
> freed right before resize_hpt_prepare_work() begins execution or
> right under mutex_lock() when it is executed in parallel with ioctl.
> 
> In both cases we get use after free in point marked with UAF on the
> diagram above.
> 
> To prevent this from happening we must not release previous request
> immediately in ioctl instead postponing this to resize_hpt_prepare_work().
> 
> Make resize_hpt_release() generic: we use it in more cases.
> 
> This fixes following or similar host kernel crash message:
> 
> [  635.277361] Unable to handle kernel paging request for data at address 0x00000000
> [  635.277438] Faulting instruction address: 0xc00000000052f568
> [  635.277446] Oops: Kernel access of bad area, sig: 11 [#1]
> [  635.277451] SMP NR_CPUS=2048 NUMA PowerNV
> [  635.277470] Modules linked in: xt_CHECKSUM iptable_mangle ipt_MASQUERADE
> nf_nat_masquerade_ipv4 iptable_nat nf_nat_ipv4 nf_nat nf_conntrack_ipv4
> nf_defrag_ipv4 xt_conntrack nf_conntrack ipt_REJECT nf_reject_ipv4 tun bridge stp llc
> ebtable_filter ebtables ip6table_filter ip6_tables iptable_filter nfsv3 nfs_acl nfs
> lockd grace fscache kvm_hv kvm rpcrdma sunrpc ib_isert iscsi_target_mod ib_iser libiscsi
> scsi_transport_iscsi ib_srpt target_core_mod ext4 ib_srp scsi_transport_srp
> ib_ipoib mbcache jbd2 rdma_ucm ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm ocrdma(T)
> ib_core ses enclosure scsi_transport_sas sg shpchp leds_powernv ibmpowernv i2c_opal
> i2c_core powernv_rng ipmi_powernv ipmi_devintf ipmi_msghandler ip_tables xfs
> libcrc32c sr_mod sd_mod cdrom lpfc nvme_fc(T) nvme_fabrics nvme_core ipr nvmet_fc(T)
> tg3 nvmet libata be2net crc_t10dif crct10dif_generic scsi_transport_fc ptp scsi_tgt
> pps_core crct10dif_common dm_mirror dm_region_hash dm_log dm_mod
> [  635.278687] CPU: 40 PID: 749 Comm: kworker/40:1 Tainted: G
> ------------ T 3.10.0.bz1510771+ #1
> [  635.278782] Workqueue: events resize_hpt_prepare_work [kvm_hv]
> [  635.278851] task: c0000007e6840000 ti: c0000007e9180000 task.ti: c0000007e9180000
> [  635.278919] NIP: c00000000052f568 LR: c0000000009ea310 CTR: c0000000009ea4f0
> [  635.278988] REGS: c0000007e91837f0 TRAP: 0300   Tainted: G
> ------------ T  (3.10.0.bz1510771+)
> [  635.279077] MSR: 9000000100009033 <SF,HV,EE,ME,IR,DR,RI,LE>  CR: 24002022  XER:
> 00000000
> [  635.279248] CFAR: c000000000009368 DAR: 0000000000000000 DSISR: 40000000 SOFTE: 1
> GPR00: c0000000009ea310 c0000007e9183a70 c000000001250b00 c0000007e9183b10
> GPR04: 0000000000000000 0000000000000000 c0000007e9183650 0000000000000000
> GPR08: c0000007ffff7b80 00000000ffffffff 0000000080000028 d00000000d2529a0
> GPR12: 0000000000002200 c000000007b56800 c000000000120028 c0000007f135bb40
> GPR16: 0000000000000000 c000000005c1e018 c000000005c1e018 0000000000000000
> GPR20: 0000000000000001 c0000000011bf778 0000000000000001 fffffffffffffef7
> GPR24: 0000000000000000 c000000f1e262e50 0000000000000002 c0000007e9180000
> GPR28: c000000f1e262e4c c000000f1e262e50 0000000000000000 c0000007e9183b10
> [  635.280149] NIP [c00000000052f568] __list_add+0x38/0x110
> [  635.280197] LR [c0000000009ea310] __mutex_lock_slowpath+0xe0/0x2c0
> [  635.280253] Call Trace:
> [  635.280277] [c0000007e9183af0] [c0000000009ea310] __mutex_lock_slowpath+0xe0/0x2c0
> [  635.280356] [c0000007e9183b70] [c0000000009ea554] mutex_lock+0x64/0x70
> [  635.280426] [c0000007e9183ba0] [d00000000d24da04]
> resize_hpt_prepare_work+0xe4/0x1c0 [kvm_hv]
> [  635.280507] [c0000007e9183c40] [c000000000113c0c] process_one_work+0x1dc/0x680
> [  635.280587] [c0000007e9183ce0] [c000000000114250] worker_thread+0x1a0/0x520
> [  635.280655] [c0000007e9183d80] [c00000000012010c] kthread+0xec/0x100
> [  635.280724] [c0000007e9183e30] [c00000000000a4b8] ret_from_kernel_thread+0x5c/0xa4
> [  635.280814] Instruction dump:
> [  635.280880] 7c0802a6 fba1ffe8 fbc1fff0 7cbd2b78 fbe1fff8 7c9e2378 7c7f1b78
> f8010010
> [  635.281099] f821ff81 e8a50008 7fa52040 40de00b8 <e8be0000> 7fbd2840 40de008c
> 7fbff040
> [  635.281324] ---[ end trace b628b73449719b9d ]---
> 
> Signed-off-by: Serhii Popovych <spopovyc@xxxxxxxxxx>

Reviewed-by: David Gibson <david@xxxxxxxxxxxxxxxxxxxxx>

> ---
>  arch/powerpc/kvm/book3s_64_mmu_hv.c | 45 ++++++++++++++++++++++++++-----------
>  1 file changed, 32 insertions(+), 13 deletions(-)
> 
> diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c
> index 3e9abd9..690f061 100644
> --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c
> +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c
> @@ -1417,15 +1417,18 @@ static void resize_hpt_pivot(struct kvm_resize_hpt *resize)
>  
>  static void resize_hpt_release(struct kvm *kvm, struct kvm_resize_hpt *resize)
>  {
> -	BUG_ON(kvm->arch.resize_hpt != resize);
> +	BUG_ON(!mutex_is_locked(&kvm->lock));
>  
>  	if (!resize)
>  		return;
>  
> -	kvmppc_free_hpt(&resize->hpt);
> +	if (resize->error != -EBUSY) {
> +		kvmppc_free_hpt(&resize->hpt);
> +		kfree(resize);
> +	}
>  
> -	kvm->arch.resize_hpt = NULL;
> -	kfree(resize);
> +	if (kvm->arch.resize_hpt == resize)
> +		kvm->arch.resize_hpt = NULL;
>  }
>  
>  static void resize_hpt_prepare_work(struct work_struct *work)
> @@ -1434,24 +1437,40 @@ static void resize_hpt_prepare_work(struct work_struct *work)
>  						     struct kvm_resize_hpt,
>  						     work);
>  	struct kvm *kvm = resize->kvm;
> -	int err;
> +	int err = 0;
>  
>  	BUG_ON(resize->error != -EBUSY);
>  
> -	resize_hpt_debug(resize, "resize_hpt_prepare_work(): order = %d\n",
> -			 resize->order);
> +	mutex_lock(&kvm->lock);
> +
> +	/* Request is still current? */
> +	if (kvm->arch.resize_hpt == resize) {
> +		/* We may request large allocations here:
> +		 * do not sleep with kvm->lock held for a while.
> +		 */
> +		mutex_unlock(&kvm->lock);
>  
> -	err = resize_hpt_allocate(resize);
> +		resize_hpt_debug(resize, "resize_hpt_prepare_work(): order = %d\n",
> +				 resize->order);
>  
> -	/* We have strict assumption about -EBUSY
> -	 * when preparing for HPT resize.
> -	 */
> -	BUG_ON(err == -EBUSY);
> +		err = resize_hpt_allocate(resize);
>  
> -	mutex_lock(&kvm->lock);
> +		/* We have strict assumption about -EBUSY
> +		 * when preparing for HPT resize.
> +		 */
> +		BUG_ON(err == -EBUSY);
> +
> +		mutex_lock(&kvm->lock);
> +		/* It is possible that kvm->arch.resize_hpt != resize
> +		 * after we grab kvm->lock again.
> +		 */
> +	}
>  
>  	resize->error = err;
>  
> +	if (kvm->arch.resize_hpt != resize)
> +		resize_hpt_release(kvm, resize);
> +
>  	mutex_unlock(&kvm->lock);
>  }
>  

-- 
David Gibson			| I'll have my music baroque, and my code
david AT gibson.dropbear.id.au	| minimalist, thank you.  NOT _the_ _other_
				| _way_ _around_!
http://www.ozlabs.org/~dgibson
Attachment:
signature.asc

Description: PGP signature