On systems that advertise ICH_VTR_EL2.SEIS, we trap all GICv3 sysreg accesses from the guest. From a performance perspective, this is OK as long as the guest doesn't hammer the GICv3 CPU interface. In most cases, this is fine, unless the guest actively uses priorities and switches PMR_EL1 very often. Which is exactly what happens when a Linux guest runs with irqchip.gicv3_pseudo_nmi=1. In these condition, the performance plumets as we hit PMR each time we mask/unmask interrupts. Not good. There is however an opportunity for improvement. Careful reading of the architecture specification indicates that the only GICv3 sysreg belonging to the common group (which contains the SGI registers, PMR, DIR, CTLR and RPR) that is allowed to generate a SError is DIR. Everything else is safe. It is thus possible to substitute the trapping of all the common group with just that of DIR if it supported by the implementation. Yes, that's yet another optional bit of the architecture. So let's just do that, as it leads to some impressive result on the M1: Without this change: bash-5.1# /host/home/maz/hackbench 100 process 1000 Running with 100*40 (== 4000) tasks. Time: 56.596 With this change: bash-5.1# /host/home/maz/hackbench 100 process 1000 Running with 100*40 (== 4000) tasks. Time: 8.649 which is a pretty convincing result. Signed-off-by: Marc Zyngier <maz@xxxxxxxxxx> --- arch/arm64/include/asm/sysreg.h | 3 +++ arch/arm64/kvm/vgic/vgic-v3.c | 15 +++++++++++---- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index b268082d67ed..9412a645a1c0 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -1152,6 +1152,7 @@ #define ICH_HCR_TC (1 << 10) #define ICH_HCR_TALL0 (1 << 11) #define ICH_HCR_TALL1 (1 << 12) +#define ICH_HCR_TDIR (1 << 14) #define ICH_HCR_EOIcount_SHIFT 27 #define ICH_HCR_EOIcount_MASK (0x1f << ICH_HCR_EOIcount_SHIFT) @@ -1184,6 +1185,8 @@ #define ICH_VTR_SEIS_MASK (1 << ICH_VTR_SEIS_SHIFT) #define ICH_VTR_A3V_SHIFT 21 #define ICH_VTR_A3V_MASK (1 << ICH_VTR_A3V_SHIFT) +#define ICH_VTR_TDS_SHIFT 19 +#define ICH_VTR_TDS_MASK (1 << ICH_VTR_TDS_SHIFT) #define ARM64_FEATURE_FIELD_BITS 4 diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index ae59e2580bf5..467c22bbade6 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -15,6 +15,7 @@ static bool group0_trap; static bool group1_trap; static bool common_trap; +static bool dir_trap; static bool gicv4_enable; void vgic_v3_set_underflow(struct kvm_vcpu *vcpu) @@ -296,6 +297,8 @@ void vgic_v3_enable(struct kvm_vcpu *vcpu) vgic_v3->vgic_hcr |= ICH_HCR_TALL1; if (common_trap) vgic_v3->vgic_hcr |= ICH_HCR_TC; + if (dir_trap) + vgic_v3->vgic_hcr |= ICH_HCR_TDIR; } int vgic_v3_lpi_sync_pending_status(struct kvm *kvm, struct vgic_irq *irq) @@ -676,14 +679,18 @@ int vgic_v3_probe(const struct gic_kvm_info *info) group0_trap = true; group1_trap = true; - common_trap = true; + if (ich_vtr_el2 & ICH_VTR_TDS_MASK) + dir_trap = true; + else + common_trap = true; } - if (group0_trap || group1_trap || common_trap) { - kvm_info("GICv3 sysreg trapping enabled ([%s%s%s], reduced performance)\n", + if (group0_trap || group1_trap || common_trap | dir_trap) { + kvm_info("GICv3 sysreg trapping enabled ([%s%s%s%s], reduced performance)\n", group0_trap ? "G0" : "", group1_trap ? "G1" : "", - common_trap ? "C" : ""); + common_trap ? "C" : "", + dir_trap ? "D" : ""); static_branch_enable(&vgic_v3_cpuif_trap); } -- 2.30.2