On Thu, 2009-04-09 at 23:50 +0800, Avi Kivity wrote: > Huang Ying wrote: > > +int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) > > +{ > > + switch (msr) { > > + case MSR_EFER: > > + set_efer(vcpu, data); > > break; > > case MSR_IA32_DEBUGCTLMSR: > > if (!data) { > > @@ -807,6 +828,8 @@ int kvm_set_msr_common(struct kvm_vcpu * > > break; > > } > > default: > > + if (!set_msr_mce(vcpu, msr, data)) > > + break; > > pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", msr, data); > > return 1; > > } > > > > Is there any reason you split kvm_set_msr_common() into two functions? I want to group MCE related MSR together. And most MCE MSR read/write need to access vcpu->arch.mcg_xxx or vcpu->arch_mce_banks, So I think use a MCE specific function would be cleaner. But It seems that something as follow would be better. kvm_set_msr_comm() { switch (msr) { case MSR_IA32_P5_MC_ADDR: case MSR_IA32_P5_MC_TYPE: case MSR_IA32_MCG_CAP: case MSR_IA32_MCG_CTL: case MSR_IA32_MCG_STATUS: case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_MISC + 4 * KVM_MCE_MAX_BANK: set_msr_mce(); break; ... } ... } > > + > > +int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) > > +{ > > + u64 data; > > + > > + switch (msr) { > > + case 0xc0010010: /* SYSCFG */ > > + case 0xc0010015: /* HWCR */ > > > > Please use MSR_ constants (add them if they don't exist yet). In fact, this is not added by me. But I can change this by the way. > > > > +static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, > > + u64 mcg_cap) > > +{ > > + int r; > > + unsigned bank_num = mcg_cap & 0xff, bank; > > + u64 *banks; > > + > > + r = -EINVAL; > > + if (!bank_num) > > + goto out; > > + r = -ENOMEM; > > + banks = kzalloc(bank_num * sizeof(u64) * 4, GFP_KERNEL); > > > > If banks is already allocated, you'll leak it. Yes. I will fix this. > Why not always allocate it on vcpu setup? Because the MCE bank number is not fixed, it is based on mcg_cap from user space. > > +static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, > > + struct kvm_x86_mce *mce) > > +{ > > + u64 mcg_cap = vcpu->arch.mcg_cap; > > + unsigned bank_num = mcg_cap & 0xff; > > + u64 *banks = vcpu->arch.mce_banks; > > + > > + if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL)) > > + return -EINVAL; > > + /* > > + * if IA32_MCG_CTL is not all 1s, the uncorrected error > > + * reporting is disabled > > + */ > > + if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) && > > + vcpu->arch.mcg_ctl != ~(u64)0) > > + return 0; > > + banks += 4 * mce->bank; > > + /* > > + * if IA32_MCi_CTL is not all 1s, the uncorrected error > > + * reporting is disabled for the bank > > + */ > > + if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0) > > + return 0; > > + if (mce->status & MCI_STATUS_UC) { > > + u64 status = mce->status; > > + if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || > > + !(vcpu->arch.cr4 & X86_CR4_MCE)) { > > + printk(KERN_DEBUG "kvm: set_mce: " > > + "injects mce exception while " > > + "previous one is in progress!\n"); > > + set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests); > > + return 0; > > + } > > + if (banks[1] & MCI_STATUS_VAL) > > + status |= MCI_STATUS_OVER; > > + banks[1] = mce->status; > > + banks[2] = mce->addr; > > + banks[3] = mce->misc; > > + vcpu->arch.mcg_status = mce->mcg_status; > > + kvm_queue_exception(vcpu, MC_VECTOR); > > + } else if (!(banks[1] & MCI_STATUS_VAL) || > > + (!(banks[1] & MCI_STATUS_UC) && > > + !((mcg_cap & MCG_TES_P) && ((banks[1]>>53) & 0x3) < 2))) { > > + u64 status = mce->status; > > + if (banks[1] & MCI_STATUS_VAL) > > + status |= MCI_STATUS_OVER; > > + banks[1] = mce->status; > > + banks[2] = mce->addr; > > + banks[3] = mce->misc; > > + } else > > + banks[1] |= MCI_STATUS_OVER; > > + return 0; > > +} > > > > Can userspace just use KVM_SET_MSR for this? In addition to assigning MSR, we have some other logic for MCE, such as BANK reporting disabling, overwriting rules, triple fault for UC MCE during MCIP. So I think we need some dedicate interface. > > + case KVM_X86_SETUP_MCE: { > > + u64 mcg_cap; > > + > > + r = -EFAULT; > > + if (copy_from_user(&mcg_cap, argp, sizeof mcg_cap)) > > + goto out; > > + /* > > + * extended machine-check state registers and CMCI are > > + * not supported. > > + */ > > + mcg_cap &= ~(MCG_EXT_P|MCG_CMCI_P); > > > > Instead of silently dropping, should return an error. > > > + if (copy_to_user(argp, &mcg_cap, sizeof mcg_cap)) > > + goto out; > > > > And not copy. This is designed as some kind of feature negotiating. Use space request MCE features via mcg_cap, kernel space remove un-supported features and return the resulting mcg_cap. > > > #define KVM_TRC_SHIFT 16 > > /* > > * kvm trace categories > > @@ -528,6 +540,9 @@ struct kvm_irq_routing { > > #define KVM_NMI _IO(KVMIO, 0x9a) > > /* Available with KVM_CAP_SET_GUEST_DEBUG */ > > #define KVM_SET_GUEST_DEBUG _IOW(KVMIO, 0x9b, struct kvm_guest_debug) > > +/* MCE for x86 */ > > +#define KVM_X86_SETUP_MCE _IOWR(KVMIO, 0x9a, __u64) > > +#define KVM_X86_SET_MCE _IOW(KVMIO, 0x9b, struct kvm_x86_mce) > > > > Please add a KVM_CAP_ to advertise this capability. Yes. I will do this. Best Regards, Huang Ying
Attachment:
signature.asc
Description: This is a digitally signed message part