> -----Original Message-----
> From: kvm-owner@xxxxxxxxxxxxxxx [mailto:kvm-owner@xxxxxxxxxxxxxxx] On Behalf Of Christoffer Dall
> Sent: Sunday, September 16, 2012 12:36 AM
> To: kvm@xxxxxxxxxxxxxxx; linux-arm-kernel@xxxxxxxxxxxxxxxxxxx; kvmarm@xxxxxxxxxxxxxxxxxxxxx
> Subject: [PATCH 13/15] KVM: ARM: Handle guest faults in KVM
>
> From: Christoffer Dall <cdall@xxxxxxxxxxxxxxx>
>
> Handles guest faults in KVM by mapping in the corresponding user pages
> in the 2nd stage page tables.
>
> We invalidate the instruction cache by MVA whenever we map a page to
> the guest (no, we cannot only do it when we have an iabt because the
> guest may happily read/write a page before hitting the icache) if the
> hardware uses VIPT or PIPT. In the latter case, we can invalidate only
> that physical page. In the first case, all bets are off and we simply
> must invalidate the whole affair. Note that VIVT icaches are tagged
> with vmids, and we are out of the woods on that one. Alexander Graf
> was nice enough to remind us of this massive pain.
>
> There may be a subtle bug hidden in here, which we currently hide by
> marking all pages dirty even when the pages are only mapped read-only.
> The current hypothesis is that marking pages dirty may exercise the IO
> system and data cache more and therefore we don't see stale data in
> the guest, but it's purely guesswork. The bug manifests as seemingly
> random kernel crashes in guests when the host is under extreme memory
> pressure and swapping is enabled.
>
> Signed-off-by: Marc Zyngier <marc.zyngier@xxxxxxx>
> Signed-off-by: Christoffer Dall <c.dall@xxxxxxxxxxxxxxxxxxxxxx>
> ---
>  arch/arm/include/asm/kvm_arm.h |    9 ++
>  arch/arm/include/asm/kvm_asm.h |    2 +
>  arch/arm/kvm/mmu.c             |  155 ++++++++++++++++++++++++++++++++++++++++
>  arch/arm/kvm/trace.h           |   28 +++++++
>  4 files changed, 193 insertions(+), 1 deletion(-)
>
> diff --git a/arch/arm/include/asm/kvm_arm.h b/arch/arm/include/asm/kvm_arm.h
> index ae586c1..4cff3b7 100644
> --- a/arch/arm/include/asm/kvm_arm.h
> +++ b/arch/arm/include/asm/kvm_arm.h
> @@ -158,11 +158,20 @@
>  #define HSR_ISS		(HSR_IL - 1)
>  #define HSR_ISV_SHIFT	(24)
>  #define HSR_ISV		(1U << HSR_ISV_SHIFT)
> +#define HSR_FSC		(0x3f)
> +#define HSR_FSC_TYPE	(0x3c)
> +#define HSR_WNR		(1 << 6)
>  #define HSR_CV_SHIFT	(24)
>  #define HSR_CV		(1U << HSR_CV_SHIFT)
>  #define HSR_COND_SHIFT	(20)
>  #define HSR_COND	(0xfU << HSR_COND_SHIFT)
>
> +#define FSC_FAULT	(0x04)
> +#define FSC_PERM	(0x0c)
> +
> +/* Hyp Prefetch Fault Address Register (HPFAR/HDFAR) */
> +#define HPFAR_MASK	(~0xf)
> +
>  #define HSR_EC_UNKNOWN	(0x00)
>  #define HSR_EC_WFI	(0x01)
>  #define HSR_EC_CP15_32	(0x03)
> diff --git a/arch/arm/include/asm/kvm_asm.h b/arch/arm/include/asm/kvm_asm.h
> index 201ec1f..40ee099 100644
> --- a/arch/arm/include/asm/kvm_asm.h
> +++ b/arch/arm/include/asm/kvm_asm.h
> @@ -43,6 +43,8 @@ extern char __kvm_hyp_vector[];
>  extern char __kvm_hyp_code_start[];
>  extern char __kvm_hyp_code_end[];
>
> +extern void __kvm_tlb_flush_vmid(struct kvm *kvm);
> +
>  extern void __kvm_flush_vm_context(void);
>  extern void __kvm_tlb_flush_vmid(struct kvm *kvm);
>
> diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c
> index ea17a97..52cc280 100644
> --- a/arch/arm/kvm/mmu.c
> +++ b/arch/arm/kvm/mmu.c
> @@ -21,10 +21,16 @@
>  #include <linux/io.h>
>  #include <asm/idmap.h>
>  #include <asm/pgalloc.h>
> +#include <asm/cacheflush.h>
>  #include <asm/kvm_arm.h>
>  #include <asm/kvm_mmu.h>
>  #include <asm/kvm_asm.h>
> +#include <asm/kvm_emulate.h>
>  #include <asm/mach/map.h>
> +#include <asm/kvm_asm.h>
> +#include <trace/events/kvm.h>
> +
> +#include "trace.h"
>
>  static DEFINE_MUTEX(kvm_hyp_pgd_mutex);
>
> @@ -490,9 +496,156 @@ out:
>  	return ret;
>  }
>
> +static void coherent_icache_guest_page(struct kvm *kvm, gfn_t gfn)
> +{
> +	/*
> +	 * If we are going to insert an instruction page and the icache is
> +	 * either VIPT or PIPT, there is a potential problem where the host
> +	 * (or another VM) may have used this page at the same virtual address
> +	 * as this guest, and we read incorrect data from the icache.  If
> +	 * we're using a PIPT cache, we can invalidate just that page, but if
> +	 * we are using a VIPT cache we need to invalidate the entire icache -
> +	 * damn shame - as written in the ARM ARM (DDI 0406C - Page B3-1384)
> +	 */
> +	if (icache_is_pipt()) {
> +		unsigned long hva = gfn_to_hva(kvm, gfn);
> +		__cpuc_coherent_user_range(hva, hva + PAGE_SIZE);
> +	} else if (!icache_is_vivt_asid_tagged()) {
> +		/* any kind of VIPT cache */
> +		__flush_icache_all();
> +	}
> +}
> +
> +static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
> +			  gfn_t gfn, struct kvm_memory_slot *memslot,
> +			  bool is_iabt, unsigned long fault_status)
> +{
> +	pte_t new_pte;
> +	pfn_t pfn, pfn_existing = KVM_PFN_ERR_BAD;
> +	int ret;
> +	bool write_fault, writable;
> +	struct kvm_mmu_memory_cache *memcache = &vcpu->arch.mmu_page_cache;
> +
> +	if (is_iabt)
> +		write_fault = false;
> +	else if ((vcpu->arch.hsr & HSR_ISV) && !(vcpu->arch.hsr & HSR_WNR))
> +		write_fault = false;
> +	else
> +		write_fault = true;
> +
> +	if (fault_status == FSC_PERM && !write_fault) {
> +		kvm_err("Unexpected L2 read permission error\n");
> +		return -EFAULT;
> +	}
> +
> +	/*
> +	 * If this is a write fault (think COW) we need to make sure the
> +	 * existing page, which other CPUs might still read, doesn't go away
> +	 * from under us, by calling gfn_to_pfn_prot(write_fault=true).
> +	 * Therefore, we call gfn_to_pfn_prot(write_fault=false), which will
> +	 * pin the existing page, then we get a new page for the user space
> +	 * pte and map this in the stage-2 table where we also make sure to
> +	 * flush the TLB for the VM, if there was an existing entry (the entry
> +	 * was updated setting the write flag to the potentially new page).
> +	 */
> +	if (fault_status == FSC_PERM) {
> +		pfn_existing = gfn_to_pfn_prot(vcpu->kvm, gfn, false, NULL);
> +		if (is_error_pfn(pfn_existing))
> +			return -EFAULT;
> +	}
> +
> +	pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write_fault, &writable);
> +	if (is_error_pfn(pfn)) {
> +		ret = -EFAULT;
> +		goto out_put_existing;
> +	}
> +
> +	/* We need minimum second+third level pages */
> +	ret = mmu_topup_memory_cache(memcache, 2, KVM_NR_MEM_OBJS);
> +	if (ret)
> +		goto out;
> +	new_pte = pfn_pte(pfn, PAGE_KVM_GUEST);
> +	if (writable)
> +		pte_val(new_pte) |= L_PTE2_WRITE;
> +	coherent_icache_guest_page(vcpu->kvm, gfn);

Why don't you flush the icache only when the guest has mapped the page
executable, the way __sync_icache_dcache() currently does for host user
mappings?
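To make the question concrete, here is roughly the shape I had in mind.
This is only an untested sketch: stage2_fault_is_exec() is a made-up
helper standing in for whatever check could tell us that the guest maps
this page with execute permission (which may well be the hard part,
given what your commit message says about iabt-only flushing), so don't
read it as a drop-in change:

	/*
	 * Hypothetical: only do icache maintenance when the page can
	 * actually be executed from, mirroring __sync_icache_dcache(),
	 * which skips the icache flush for non-executable host ptes.
	 *
	 * stage2_fault_is_exec() does not exist anywhere; it is a
	 * placeholder for an "is this mapping executable?" test.
	 */
	if (is_iabt || stage2_fault_is_exec(vcpu))
		coherent_icache_guest_page(vcpu->kvm, gfn);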
> +
> +	spin_lock(&vcpu->kvm->arch.pgd_lock);
> +	stage2_set_pte(vcpu->kvm, memcache, fault_ipa, &new_pte);
> +	spin_unlock(&vcpu->kvm->arch.pgd_lock);
> +
> +out:
> +	/*
> +	 * XXX TODO FIXME:
> +	 * This is _really_ *weird* !!!
> +	 * We should only be calling the _dirty version when we map something
> +	 * writable, but this causes memory failures in guests under heavy
> +	 * memory pressure on the host and heavy swapping.
> +	 */
> +	kvm_release_pfn_dirty(pfn);
> +out_put_existing:
> +	if (!is_error_pfn(pfn_existing))
> +		kvm_release_pfn_clean(pfn_existing);
> +	return ret;
> +}
> +
> +/**
> + * kvm_handle_guest_abort - handles all 2nd stage aborts
> + * @vcpu:	the VCPU pointer
> + * @run:	the kvm_run structure
> + *
> + * Any abort that gets to the host is almost guaranteed to be caused by a
> + * missing second stage translation table entry, which can mean that either
> + * the guest simply needs more memory and we must allocate an appropriate
> + * page or it can mean that the guest tried to access I/O memory, which is
> + * emulated by user space. The distinction is based on the IPA causing the
> + * fault and whether this memory region has been registered as standard RAM
> + * by user space.
> + */
>  int kvm_handle_guest_abort(struct kvm_vcpu *vcpu, struct kvm_run *run)
>  {
> -	return -EINVAL;
> +	unsigned long hsr_ec;
> +	unsigned long fault_status;
> +	phys_addr_t fault_ipa;
> +	struct kvm_memory_slot *memslot = NULL;
> +	bool is_iabt;
> +	gfn_t gfn;
> +	int ret;
> +
> +	hsr_ec = vcpu->arch.hsr >> HSR_EC_SHIFT;
> +	is_iabt = (hsr_ec == HSR_EC_IABT);
> +	fault_ipa = ((phys_addr_t)vcpu->arch.hpfar & HPFAR_MASK) << 8;
> +
> +	trace_kvm_guest_fault(*vcpu_pc(vcpu), vcpu->arch.hsr,
> +			      vcpu->arch.hdfar, vcpu->arch.hifar, fault_ipa);
> +
> +	/* Check the stage-2 fault is trans. fault or write fault */
> +	fault_status = (vcpu->arch.hsr & HSR_FSC_TYPE);
> +	if (fault_status != FSC_FAULT && fault_status != FSC_PERM) {
> +		kvm_err("Unsupported fault status: EC=%#lx DFCS=%#lx\n",
> +			hsr_ec, fault_status);
> +		return -EFAULT;
> +	}
> +
> +	gfn = fault_ipa >> PAGE_SHIFT;
> +	if (!kvm_is_visible_gfn(vcpu->kvm, gfn)) {
> +		if (is_iabt) {
> +			/* Prefetch Abort on I/O address */
> +			kvm_inject_pabt(vcpu, vcpu->arch.hifar);
> +			return 1;
> +		}
> +
> +		kvm_pr_unimpl("I/O address abort...");
> +		return 0;
> +	}
> +
> +	memslot = gfn_to_memslot(vcpu->kvm, gfn);
> +	if (!memslot->user_alloc) {
> +		kvm_err("non user-alloc memslots not supported\n");
> +		return -EINVAL;
> +	}
> +
> +	ret = user_mem_abort(vcpu, fault_ipa, gfn, memslot,
> +			     is_iabt, fault_status);
> +	return ret ? ret : 1;
>  }
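A small aside for anyone reading along: the fault_ipa computation above
works because HPFAR reports bits [39:12] of the faulting IPA in its bits
[31:4] (ARM ARM DDI 0406C), so masking off the low nibble and shifting
left by 8 yields the page-aligned 40-bit IPA; the page offset, when it
matters, comes from HDFAR/HIFAR instead. A worked example with made-up
numbers, reusing the HPFAR_MASK definition from this patch:

	/* Example only -- the value is invented, not from a real fault. */
	u32 hpfar = 0x00123456;		/* FIPA[39:12] live in bits [31:4] */
	phys_addr_t fault_ipa = ((phys_addr_t)hpfar & HPFAR_MASK) << 8;
	/* (0x00123456 & ~0xf) << 8 == 0x12345000, the page-aligned IPA */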
>
>  static void handle_hva_to_gpa(struct kvm *kvm, unsigned long hva,
> diff --git a/arch/arm/kvm/trace.h b/arch/arm/kvm/trace.h
> index 772e045..40606c9 100644
> --- a/arch/arm/kvm/trace.h
> +++ b/arch/arm/kvm/trace.h
> @@ -39,6 +39,34 @@ TRACE_EVENT(kvm_exit,
>  	TP_printk("PC: 0x%08lx", __entry->vcpu_pc)
>  );
>
> +TRACE_EVENT(kvm_guest_fault,
> +	TP_PROTO(unsigned long vcpu_pc, unsigned long hsr,
> +		 unsigned long hdfar, unsigned long hifar,
> +		 unsigned long ipa),
> +	TP_ARGS(vcpu_pc, hsr, hdfar, hifar, ipa),
> +
> +	TP_STRUCT__entry(
> +		__field(	unsigned long,	vcpu_pc	)
> +		__field(	unsigned long,	hsr	)
> +		__field(	unsigned long,	hdfar	)
> +		__field(	unsigned long,	hifar	)
> +		__field(	unsigned long,	ipa	)
> +	),
> +
> +	TP_fast_assign(
> +		__entry->vcpu_pc	= vcpu_pc;
> +		__entry->hsr		= hsr;
> +		__entry->hdfar		= hdfar;
> +		__entry->hifar		= hifar;
> +		__entry->ipa		= ipa;
> +	),
> +
> +	TP_printk("guest fault at PC %#08lx (hdfar %#08lx, hifar %#08lx, "
> +		  "hsr %#08lx, ipa %#08lx)",
> +		  __entry->vcpu_pc, __entry->hdfar, __entry->hifar,
> +		  __entry->hsr, __entry->ipa)
> +);
> +
>  TRACE_EVENT(kvm_irq_line,
>  	TP_PROTO(unsigned int type, int vcpu_idx, int irq_num, int level),
>  	TP_ARGS(type, vcpu_idx, irq_num, level),