Hi Marc, On 8/10/23 16:44, Marc Zyngier wrote: > Hi Eric, > > On Wed, 09 Aug 2023 14:27:27 +0100, > Eric Auger <eric.auger@xxxxxxxxxx> wrote: >> Hi Marc, >> >> On 8/8/23 13:46, Marc Zyngier wrote: >>> A significant part of what a NV hypervisor needs to do is to decide >>> whether a trap from a L2+ guest has to be forwarded to a L1 guest >>> or handled locally. This is done by checking for the trap bits that >>> the guest hypervisor has set and acting accordingly, as described by >>> the architecture. >>> >>> A previous approach was to sprinkle a bunch of checks in all the >>> system register accessors, but this is pretty error prone and doesn't >>> help getting an overview of what is happening. >>> >>> Instead, implement a set of global tables that describe a trap bit, >>> combinations of trap bits, behaviours on trap, and what bits must >>> be evaluated on a system register trap. >>> >>> Although this is painful to describe, this allows to specify each >>> and every control bit in a static manner. To make it efficient, >>> the table is inserted in an xarray that is global to the system, >>> and checked each time we trap a system register while running >>> a L2 guest. >>> >>> Add the basic infrastructure for now, while additional patches will >>> implement configuration registers. >>> >>> Signed-off-by: Marc Zyngier <maz@xxxxxxxxxx> >>> --- >>> arch/arm64/include/asm/kvm_host.h | 1 + >>> arch/arm64/include/asm/kvm_nested.h | 2 + >>> arch/arm64/kvm/emulate-nested.c | 262 ++++++++++++++++++++++++++++ >>> arch/arm64/kvm/sys_regs.c | 6 + >>> arch/arm64/kvm/trace_arm.h | 26 +++ >>> 5 files changed, 297 insertions(+) >>> >>> diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h >>> index 721680da1011..cb1c5c54cedd 100644 >>> --- a/arch/arm64/include/asm/kvm_host.h >>> +++ b/arch/arm64/include/asm/kvm_host.h >>> @@ -988,6 +988,7 @@ int kvm_handle_cp10_id(struct kvm_vcpu *vcpu); >>> void kvm_reset_sys_regs(struct kvm_vcpu *vcpu); >>> >>> int __init kvm_sys_reg_table_init(void); >>> +int __init populate_nv_trap_config(void); >>> >>> bool lock_all_vcpus(struct kvm *kvm); >>> void unlock_all_vcpus(struct kvm *kvm); >>> diff --git a/arch/arm64/include/asm/kvm_nested.h b/arch/arm64/include/asm/kvm_nested.h >>> index 8fb67f032fd1..fa23cc9c2adc 100644 >>> --- a/arch/arm64/include/asm/kvm_nested.h >>> +++ b/arch/arm64/include/asm/kvm_nested.h >>> @@ -11,6 +11,8 @@ static inline bool vcpu_has_nv(const struct kvm_vcpu *vcpu) >>> test_bit(KVM_ARM_VCPU_HAS_EL2, vcpu->arch.features)); >>> } >>> >>> +extern bool __check_nv_sr_forward(struct kvm_vcpu *vcpu); >>> + >>> struct sys_reg_params; >>> struct sys_reg_desc; >>> >>> diff --git a/arch/arm64/kvm/emulate-nested.c b/arch/arm64/kvm/emulate-nested.c >>> index b96662029fb1..1b1148770d45 100644 >>> --- a/arch/arm64/kvm/emulate-nested.c >>> +++ b/arch/arm64/kvm/emulate-nested.c >>> @@ -14,6 +14,268 @@ >>> >>> #include "trace.h" >>> >>> +enum trap_behaviour { >>> + BEHAVE_HANDLE_LOCALLY = 0, >>> + BEHAVE_FORWARD_READ = BIT(0), >>> + BEHAVE_FORWARD_WRITE = BIT(1), >>> + BEHAVE_FORWARD_ANY = BEHAVE_FORWARD_READ | BEHAVE_FORWARD_WRITE, >>> +}; >>> + >>> +struct trap_bits { >>> + const enum vcpu_sysreg index; >>> + const enum trap_behaviour behaviour; >>> + const u64 value; >>> + const u64 mask; >>> +}; >>> + >>> +enum trap_group { >> nit: Maybe add a comment saying that it relates to *coarse* trapping as >> opposed to the other enum which is named fgt_group_id. cgt_group_id may >> have been better but well ;-) > You mean I should apply some sort of consistency to this code in order > to help reviewers understanding it? Fool! ;-) > > Point taken, I'll go over it and apply your suggestion. :-) > >>> + /* Indicates no coarse trap control */ >>> + __RESERVED__, >>> + >>> + /* >>> + * The first batch of IDs denote coarse trapping that are used >>> + * on their own instead of being part of a combination of >>> + * trap controls. >>> + */ >>> + >>> + /* >>> + * Anything after this point is a combination of trap controls, >> coarse trap controls > Yup. > >>> + * which all must be evaluated to decide what to do. >>> + */ >>> + __MULTIPLE_CONTROL_BITS__, >>> + >>> + /* >>> + * Anything after this point requires a callback evaluating a >>> + * complex trap condition. Hopefully we'll never need this... >>> + */ >>> + __COMPLEX_CONDITIONS__, >>> + >>> + /* Must be last */ >>> + __NR_TRAP_GROUP_IDS__ >>> +}; >>> + >>> +static const struct trap_bits coarse_trap_bits[] = { >>> +}; >>> + >>> +#define MCB(id, ...) \ >>> + [id - __MULTIPLE_CONTROL_BITS__] = \ >>> + (const enum trap_group []){ \ >>> + __VA_ARGS__, __RESERVED__ \ >>> + } >>> + >>> +static const enum trap_group *coarse_control_combo[] = { >>> +}; >>> + >>> +typedef enum trap_behaviour (*complex_condition_check)(struct kvm_vcpu *); >>> + >>> +#define CCC(id, fn) \ >>> + [id - __COMPLEX_CONDITIONS__] = fn >>> + >>> +static const complex_condition_check ccc[] = { >>> +}; >>> + >>> +/* >>> + * Bit assignment for the trap controls. We use a 64bit word with the >>> + * following layout for each trapped sysreg: >>> + * >>> + * [9:0] enum trap_group (10 bits) >>> + * [13:10] enum fgt_group_id (4 bits) >>> + * [19:14] bit number in the FGT register (6 bits) >>> + * [20] trap polarity (1 bit) >>> + * [62:21] Unused (42 bits) >>> + * [63] RES0 - Must be zero, as lost on insertion in the xarray >> what do you mean by "as lost" > The xarray is only able to store 63 bit of information, not 64, as it > uses the LSB for its own purpose. This isn't a problem when storing a > pointer (the last 2 bits are usually 0 if pointing to a structure). > However, things get funny when you're trying to assign an integer > value to an xarray location. > > This is why we use the xa_mk_value() helper, which is defined as: > > static inline void *xa_mk_value(unsigned long v) > { > WARN_ON((long)v < 0); > return (void *)((v << 1) | 1); > } > > and the opposite helper on retrieval: > > static inline unsigned long xa_to_value(const void *entry) > { > return (unsigned long)entry >> 1; > } > > As you now tell, the MSB is lost on insertion. Hence the comment that > warns against the use of bit 63, as we will never get it back. I'll > add some extra checks for that in the code that populates the trap > configuration. OK thank you for the clarification/education! Eric > >>> + */ >>> +#define TC_CGT_BITS 10 >>> +#define TC_FGT_BITS 4 >>> + >>> +union trap_config { >>> + u64 val; >>> + struct { >>> + unsigned long cgt:TC_CGT_BITS; /* Coarse trap id */ >>> + unsigned long fgt:TC_FGT_BITS; /* Fing Grained Trap id */ >> Fine & align capital letter in Trap for both comments > Done. > >>> + unsigned long bit:6; /* Bit number */ >>> + unsigned long pol:1; /* Polarity */ >>> + unsigned long unk:42; /* Unknown */ >> s//Unknown/Unused? > Yup, that's better. > >>> + unsigned long mbz:1; /* Must Be Zero */ >>> + }; >>> +}; >>> + >>> +struct encoding_to_trap_config { >>> + const u32 encoding; >>> + const u32 end; >>> + const union trap_config tc; >>> +}; >>> + >>> +#define SR_RANGE_TRAP(sr_start, sr_end, trap_id) \ >>> + { \ >>> + .encoding = sr_start, \ >>> + .end = sr_end, \ >>> + .tc = { \ >>> + .cgt = trap_id, \ >>> + }, \ >>> + } >>> + >>> +#define SR_TRAP(sr, trap_id) SR_RANGE_TRAP(sr, sr, trap_id) >>> + >>> +/* >>> + * Map encoding to trap bits for exception reported with EC=0x18. >>> + * These must only be evaluated when running a nested hypervisor, but >>> + * that the current context is not a hypervisor context. When the >>> + * trapped access matches one of the trap controls, the exception is >>> + * re-injected in the nested hypervisor. >> I must confess I was confused by the "forwarding" terminology versus >> "re-injection into the nested hyp" >> >> cf. >> >> "decide whether a trap from a L2+ guest has to be forwarded to a L1 guest >> or handled locally" >> >> "re-injection into the nested hyp" sounds clearer to me. > I see them as two sides of the same coin: the "forwarding" is the > high-level action (we pass the exception on to the L1 hypervisor). The > "re-injection" is the low-level implementation of the forwarding, > which is a complicated process (a full world switch). understood. > >>> + */ >>> +static const struct encoding_to_trap_config encoding_to_cgt[] __initconst = { >>> +}; >>> + >>> +static DEFINE_XARRAY(sr_forward_xa); >>> + >>> +static union trap_config get_trap_config(u32 sysreg) >>> +{ >>> + return (union trap_config) { >>> + .val = xa_to_value(xa_load(&sr_forward_xa, sysreg)), >>> + }; >>> +} >>> + >>> +int __init populate_nv_trap_config(void) >>> +{ >>> + int ret = 0; >>> + >>> + BUILD_BUG_ON(sizeof(union trap_config) != sizeof(void *)); >>> + BUILD_BUG_ON(__NR_TRAP_GROUP_IDS__ > BIT(TC_CGT_BITS)); >>> + >>> + for (int i = 0; i < ARRAY_SIZE(encoding_to_cgt); i++) { >>> + const struct encoding_to_trap_config *cgt = &encoding_to_cgt[i]; >>> + void *prev; >>> + >>> + prev = xa_store_range(&sr_forward_xa, cgt->encoding, cgt->end, >>> + xa_mk_value(cgt->tc.val), GFP_KERNEL); >>> + >>> + if (prev) { >>> + kvm_err("Duplicate CGT for (%d, %d, %d, %d, %d)\n", >>> + sys_reg_Op0(cgt->encoding), >>> + sys_reg_Op1(cgt->encoding), >>> + sys_reg_CRn(cgt->encoding), >>> + sys_reg_CRm(cgt->encoding), >>> + sys_reg_Op2(cgt->encoding)); >>> + ret = -EINVAL; >>> + } >>> + } >>> + >>> + kvm_info("nv: %ld coarse grained trap handlers\n", >>> + ARRAY_SIZE(encoding_to_cgt)); >>> + >>> + for (int id = __MULTIPLE_CONTROL_BITS__; >>> + id < (__COMPLEX_CONDITIONS__ - 1); >>> + id++) { >>> + const enum trap_group *cgids; >>> + >>> + cgids = coarse_control_combo[id - __MULTIPLE_CONTROL_BITS__]; >>> + >>> + for (int i = 0; cgids[i] != __RESERVED__; i++) { >>> + if (cgids[i] >= __MULTIPLE_CONTROL_BITS__) { >>> + kvm_err("Recursive MCB %d/%d\n", id, cgids[i]); >>> + ret = -EINVAL; >>> + } >>> + } >>> + } >>> + >>> + if (ret) >>> + xa_destroy(&sr_forward_xa); >>> + >>> + return ret; >>> +} >>> + >>> +static enum trap_behaviour get_behaviour(struct kvm_vcpu *vcpu, >>> + const struct trap_bits *tb) >>> +{ >>> + enum trap_behaviour b = BEHAVE_HANDLE_LOCALLY; >>> + u64 val; >>> + >>> + val = __vcpu_sys_reg(vcpu, tb->index); >>> + if ((val & tb->mask) == tb->value) >>> + b |= tb->behaviour; >>> + >>> + return b; >>> +} >>> + >>> +static enum trap_behaviour __do_compute_trap_behaviour(struct kvm_vcpu *vcpu, >>> + const enum trap_group id, >>> + enum trap_behaviour b) >>> +{ >>> + switch (id) { >>> + const enum trap_group *cgids; >>> + >>> + case __RESERVED__ ... __MULTIPLE_CONTROL_BITS__ - 1: >>> + if (likely(id != __RESERVED__)) >>> + b |= get_behaviour(vcpu, &coarse_trap_bits[id]); >>> + break; >>> + case __MULTIPLE_CONTROL_BITS__ ... __COMPLEX_CONDITIONS__ - 1: >>> + /* Yes, this is recursive. Don't do anything stupid. */ >>> + cgids = coarse_control_combo[id - __MULTIPLE_CONTROL_BITS__]; >>> + for (int i = 0; cgids[i] != __RESERVED__; i++) >>> + b |= __do_compute_trap_behaviour(vcpu, cgids[i], b); >>> + break; >>> + default: >>> + if (ARRAY_SIZE(ccc)) >>> + b |= ccc[id - __COMPLEX_CONDITIONS__](vcpu); >>> + break; >>> + } >>> + >>> + return b; >>> +} >>> + >>> +static enum trap_behaviour compute_trap_behaviour(struct kvm_vcpu *vcpu, >>> + const union trap_config tc) >>> +{ >>> + enum trap_behaviour b = BEHAVE_HANDLE_LOCALLY; >>> + >>> + return __do_compute_trap_behaviour(vcpu, tc.cgt, b); >>> +} >>> + >>> +bool __check_nv_sr_forward(struct kvm_vcpu *vcpu) >>> +{ >>> + union trap_config tc; >>> + enum trap_behaviour b; >>> + bool is_read; >>> + u32 sysreg; >>> + u64 esr; >>> + >>> + if (!vcpu_has_nv(vcpu) || is_hyp_ctxt(vcpu)) >>> + return false; >>> + >>> + esr = kvm_vcpu_get_esr(vcpu); >>> + sysreg = esr_sys64_to_sysreg(esr); >>> + is_read = (esr & ESR_ELx_SYS64_ISS_DIR_MASK) == ESR_ELx_SYS64_ISS_DIR_READ; >>> + >>> + tc = get_trap_config(sysreg); >>> + >>> + /* >>> + * A value of 0 for the whole entry means that we know nothing >>> + * for this sysreg, and that it cannot be forwareded. In this >> forwarded > Fixed. > > Thanks again for all your suggestions. you're welcome! Eric > > M. >