On Thu, Jan 16, 2025 at 02:23:24PM +0800, Binbin Wu wrote:
> 
> 
> 
> On 1/13/2025 10:12 AM, Yan Zhao wrote:
> [...]
> > +
> >  /* TDH.PHYMEM.PAGE.RECLAIM is allowed only when destroying the TD. */
> >  static int __tdx_reclaim_page(hpa_t pa)
> >  {
> > @@ -979,6 +999,14 @@ fastpath_t tdx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
> >  		return EXIT_FASTPATH_NONE;
> >  	}
> > +	/*
> > +	 * Wait until retry of SEPT-zap-related SEAMCALL completes before
> > +	 * allowing vCPU entry to avoid contention with tdh_vp_enter() and
> > +	 * TDCALLs.
> > +	 */
> > +	if (unlikely(READ_ONCE(to_kvm_tdx(vcpu->kvm)->wait_for_sept_zap)))
> > +		return EXIT_FASTPATH_EXIT_HANDLED;
> > +
> >  	trace_kvm_entry(vcpu, force_immediate_exit);
> >  	if (pi_test_on(&tdx->pi_desc)) {
> > @@ -1647,15 +1675,23 @@ static int tdx_sept_drop_private_spte(struct kvm *kvm, gfn_t gfn,
> >  	if (KVM_BUG_ON(!is_hkid_assigned(kvm_tdx), kvm))
> >  		return -EINVAL;
> > -	do {
> > +	/*
> > +	 * When zapping private page, write lock is held. So no race condition
> > +	 * with other vcpu sept operation.
> > +	 * Race with TDH.VP.ENTER due to (0-step mitigation) and Guest TDCALLs.
> > +	 */
> > +	err = tdh_mem_page_remove(kvm_tdx->tdr_pa, gpa, tdx_level, &entry,
> > +				  &level_state);
> > +	if ((err & TDX_OPERAND_BUSY)) {
> 
> It is not safe to use "err & TDX_OPERAND_BUSY".
> E.g., if the error is TDX_EPT_WALK_FAILED, "err & TDX_OPERAND_BUSY" will be true.
> 
> Maybe you can add a helper to check it.
> 
> static inline bool tdx_operand_busy(u64 err)
> {
> 	return (err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY;
> }
> 
Good catch! Thanks!

> 
> >  		/*
> > -		 * When zapping private page, write lock is held. So no race
> > -		 * condition with other vcpu sept operation. Race only with
> > -		 * TDH.VP.ENTER.
> > +		 * The second retry is expected to succeed after kicking off all
> > +		 * other vCPUs and prevent them from invoking TDH.VP.ENTER.
> >  		 */
> > +		tdx_no_vcpus_enter_start(kvm);
> >  		err = tdh_mem_page_remove(kvm_tdx->tdr_pa, gpa, tdx_level, &entry,
> >  					  &level_state);
> > -	} while (unlikely(err == TDX_ERROR_SEPT_BUSY));
> > +		tdx_no_vcpus_enter_stop(kvm);
> > +	}
> >  	if (unlikely(kvm_tdx->state != TD_STATE_RUNNABLE &&
> >  		     err == (TDX_EPT_WALK_FAILED | TDX_OPERAND_ID_RCX))) {
> > @@ -1726,8 +1762,12 @@ static int tdx_sept_zap_private_spte(struct kvm *kvm, gfn_t gfn,
> >  	WARN_ON_ONCE(level != PG_LEVEL_4K);
> >  	err = tdh_mem_range_block(kvm_tdx->tdr_pa, gpa, tdx_level, &entry, &level_state);
> > -	if (unlikely(err == TDX_ERROR_SEPT_BUSY))
> > -		return -EAGAIN;
> > +	if (unlikely(err & TDX_OPERAND_BUSY)) {
> Ditto.
> 
> > +		/* After no vCPUs enter, the second retry is expected to succeed */
> > +		tdx_no_vcpus_enter_start(kvm);
> > +		err = tdh_mem_range_block(kvm_tdx->tdr_pa, gpa, tdx_level, &entry, &level_state);
> > +		tdx_no_vcpus_enter_stop(kvm);
> > +	}
> >  	if (KVM_BUG_ON(err, kvm)) {
> >  		pr_tdx_error_2(TDH_MEM_RANGE_BLOCK, err, entry, level_state);
> >  		return -EIO;
> > @@ -1770,9 +1810,13 @@ static void tdx_track(struct kvm *kvm)
> >  	lockdep_assert_held_write(&kvm->mmu_lock);
> > -	do {
> > +	err = tdh_mem_track(kvm_tdx->tdr_pa);
> > +	if ((err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY) {
> > +		/* After no vCPUs enter, the second retry is expected to succeed */
> > +		tdx_no_vcpus_enter_start(kvm);
> >  		err = tdh_mem_track(kvm_tdx->tdr_pa);
> > -	} while (unlikely((err & TDX_SEAMCALL_STATUS_MASK) == TDX_OPERAND_BUSY));
> > +		tdx_no_vcpus_enter_stop(kvm);
> > +	}
> > 
> [...]
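
For readers following the thread: the retry pattern above relies on a pair of helpers that block TD entries while the SEAMCALL is retried under the write mmu_lock. Below is a minimal sketch of what they could look like; the helper names tdx_no_vcpus_enter_start()/_stop() and the wait_for_sept_zap field come from the quoted hunks, while the bodies (e.g. kicking vCPUs via kvm_make_all_cpus_request() with KVM_REQ_OUTSIDE_GUEST_MODE) are an assumption for illustration, not the actual patch code.

/*
 * Sketch only (not from the patch): block TD entries, kick all vCPUs out
 * of the guest, retry the SEAMCALL, then unblock.  tdx_vcpu_run() checks
 * wait_for_sept_zap and returns EXIT_FASTPATH_EXIT_HANDLED while it is set.
 */
static void tdx_no_vcpus_enter_start(struct kvm *kvm)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);

	lockdep_assert_held_write(&kvm->mmu_lock);

	/* Prevent further TD entries; read with READ_ONCE() in tdx_vcpu_run(). */
	WRITE_ONCE(kvm_tdx->wait_for_sept_zap, true);

	/* Kick all vCPUs out of the guest so the retried SEAMCALL cannot contend. */
	kvm_make_all_cpus_request(kvm, KVM_REQ_OUTSIDE_GUEST_MODE);
}

static void tdx_no_vcpus_enter_stop(struct kvm *kvm)
{
	struct kvm_tdx *kvm_tdx = to_kvm_tdx(kvm);

	lockdep_assert_held_write(&kvm->mmu_lock);

	/* Allow TD entries again once the retry has completed. */
	WRITE_ONCE(kvm_tdx->wait_for_sept_zap, false);
}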