On Tue, 2009-07-07 at 16:17 +0200, Alexander Graf wrote: > This is the of entry / exit code. In order to switch between host and guest > context, we need to switch register state and call the exit code handler on > exit. > > This assembly file does exactly that. To finally enter the guest it calls > into 970_slb.S. On exit it gets jumped at from 970_slb.S too. > > Add header definition for highmem handler > > Signed-off-by: Alexander Graf <agraf@xxxxxxx> > --- Why "highmem" ? IE, That terminology usually means something completely different in Linux :-) (Aka, memory beyond the linear mapping which is a concept that does not exist on ppc64). I suppose you mean code that runs outside of the RMA ? (AKA Real Memory Area, which is the memory that can be accessed while in real mode). I have a few comments, this is in no way an in-depth review, I don't yet totally see the big picture of your implementation but a few things I spotted along the way: > arch/powerpc/include/asm/kvm_ppc.h | 1 + .../... > + > +/***************************************************************************** > + * * > + * Guest entry / exit code that is in kernel module memory (highmem) * > + * * > + ****************************************************************************/ > + > +/* Registers: > + * r3: kvm_run pointer > + * r4: vcpu pointer > + */ > +_GLOBAL(__kvmppc_vcpu_entry) > + > +kvm_start_entry: > + > + /* Save host state to the stack */ > + stdu r1, -HOST_STACK_SIZE(r1) > + std r3, HOST_STACK_RUN(r1) > + std r4, HOST_STACK_VCPU(r1) > + > + std r14, HOST_STACK_R14(r1) > + std r15, HOST_STACK_R15(r1) > + std r16, HOST_STACK_R16(r1) > + std r17, HOST_STACK_R17(r1) > + std r18, HOST_STACK_R18(r1) > + std r19, HOST_STACK_R19(r1) > + std r20, HOST_STACK_R20(r1) > + std r21, HOST_STACK_R21(r1) > + std r22, HOST_STACK_R22(r1) > + std r23, HOST_STACK_R23(r1) > + std r24, HOST_STACK_R24(r1) > + std r25, HOST_STACK_R25(r1) > + std r26, HOST_STACK_R26(r1) > + std r27, HOST_STACK_R27(r1) > + std r28, HOST_STACK_R28(r1) > + std r29, HOST_STACK_R29(r1) > + std r30, HOST_STACK_R30(r1) > + std r31, HOST_STACK_R31(r1) > + mflr r14 > + std r14, HOST_STACK_LR(r1) Can we make that look closer to a pt_regs maybe or is that not worth it ? > +/* XXX optimize non-volatile loading away */ > +kvm_start_lightweight: > + > + DISABLE_INTERRUPTS > + > + /* Save R1/R2 in the PACA */ > + std r1, PACAR1(r13) > + std r2, (PACA_EXMC+EX_SRR0)(r13) > + ld r3, VCPU_HIGHMEM_HANDLER(r4) > + std r3, PACASAVEDMSR(r13) > + > + /* Load non-volatile guest state from the vcpu */ > + ld r14, VCPU_GPR(r14)(r4) > + ld r15, VCPU_GPR(r15)(r4) > + ld r16, VCPU_GPR(r16)(r4) > + ld r17, VCPU_GPR(r17)(r4) > + ld r18, VCPU_GPR(r18)(r4) > + ld r19, VCPU_GPR(r19)(r4) > + ld r20, VCPU_GPR(r20)(r4) > + ld r21, VCPU_GPR(r21)(r4) > + ld r22, VCPU_GPR(r22)(r4) > + ld r23, VCPU_GPR(r23)(r4) > + ld r24, VCPU_GPR(r24)(r4) > + ld r25, VCPU_GPR(r25)(r4) > + ld r26, VCPU_GPR(r26)(r4) > + ld r27, VCPU_GPR(r27)(r4) > + ld r28, VCPU_GPR(r28)(r4) > + ld r29, VCPU_GPR(r29)(r4) > + ld r30, VCPU_GPR(r30)(r4) > + ld r31, VCPU_GPR(r31)(r4) > + > + ld r9, VCPU_PC(r4) /* r9 = vcpu->arch.pc */ > + ld r10, VCPU_SHADOW_MSR(r4) /* r10 = vcpu->arch.shadow_msr */ > + > + ld r3, VCPU_TRAMPOLINE_ENTER(r4) > + mtsrr0 r3 > + > + loadimm r3, MSR_KERNEL & ~(MSR_IR | MSR_DR) > + mtsrr1 r3 > + > + /* Load guest state in the respective registers */ > + lwz r3, VCPU_CR(r4) /* r3 = vcpu->arch.cr */ > + stw r3, (PACA_EXMC + EX_CCR)(r13) > + > + ld r3, VCPU_CTR(r4) /* r3 = vcpu->arch.ctr */ > + mtctr r3 /* CTR = r3 */ > + > + ld r3, VCPU_LR(r4) /* r3 = vcpu->arch.lr */ > + mtlr r3 /* LR = r3 */ > + > + ld r3, VCPU_XER(r4) /* r3 = vcpu->arch.xer */ > + std r3, (PACA_EXMC + EX_R3)(r13) > + > + /* This sets the Magic value for the trampoline: > + * > + * PPC64: SPRG3 |= 1 > + */ > + setmagc r3 > + > + /* Some guests may need to have dcbz set to 32 byte length. > + * > + * Usually we ensure that by patching the guest's instructions > + * to trap on dcbz and emulate it in the hypervisor. > + * > + * If we can, we should tell the CPU to use 32 byte dcbz though, > + * because that's a lot faster. > + */ > + > + ld r3, VCPU_HFLAGS(r4) > + rldicl. r3, r3, 0, 63 /* CR = ((r3 & 1) == 0) */ > + beq no_dcbz32_on > + > + mfspr r3,SPRN_HID5 > + ori r3, r3, 0x80 /* XXX HID5_dcbz32 = 0x80 */ > + mtspr SPRN_HID5,r3 > + > +no_dcbz32_on: The whole dcbz stuff could probably be a cpufeature block so it gets nop'ed out when running on other processors than 970 since they don't all support that magic dcbz trick. Also, I think HID5 is a HV reserved register thus you won't be able to do that trick when running yourself with MSR:HV=0, for example when running on a js2x blade. > + /* Save guest DAR */ > + mfdar r5 > + std r5, VCPU_FAULT_DEAR(r12) The guest is running with MSR:PR set to 0 or 1 ? If 1, it doesn't have access to DAR or DSISR so I don't quite see the point of saving/restoring them here, you can just hand out the register straight off your shadow when taking the protection faults as the guest tries to access them. If the guest is running with PR:0 then there is no protection of the host against the guest which sucks :-) Or do I miss something ? > + /* Save guest DSISR */ > + mfdsisr r5 > + std r5, VCPU_FAULT_DSISR(r12) > + > + /* Restore host msr -> SRR1 */ > + ld r7, VCPU_HOST_MSR(r12) > + mtsrr1 r7 > + > + /* Restore host IP -> SRR0 */ > + ld r6, VCPU_HOST_RETIP(r12) > + mtsrr0 r6 > + > + /* For some interrupts, we need to call the real Linux */ > + /* handler, so it can do work for us. This has to happen */ > + /* as if the interrupt arrived from the kernel though, */ > + /* so let's fake it here where most state is restored. */ > + > + /* Call Linux for hardware interrupts/decrementer */ > + /* r3 = address of interrupt handler (exit reason) */ > + > + cmpwi r3, PPC970_INTERRUPT_EXTERNAL > + beq call_linux_handler > + cmpwi r3, PPC970_INTERRUPT_DECREMENTER > + beq call_linux_handler > + > + /* Back to Paged Mode! (goto kvm_return_point) with interrupts enabled */ > + RFI Ok so I need to understand better the whole model... ie how you get in/out of the guest etc... I would have thought you wanted to call into kernel interrupts such as DEC or EE as if coming from userspace actually... > +call_linux_handler: > + > + /* If we land here we need to jump back to the handler we */ > + /* came from. */ > + > + /* We have a page that we can access from real mode, so let's */ > + /* jump back to that and use it as a trampoline to get back into the */ > + /* interrupt handler! */ > + > + /* Enable soft interrupts again, so the handler acts */ > + li r5, 1 > + stb r5, PACASOFTIRQEN(r13) But we aren't supposed to enter the timer or EE with softirq enabled... BTW we probably also need to record some of that stuff with lockdep but we can look at that later. > + /* R3 still contains the exit code, */ > + /* R6 VCPU_HOST_RETIP and */ > + /* R7 VCPU_HOST_MSR */ > + > + mtlr r3 > + > + ld r5, VCPU_TRAMPOLINE_LOWMEM(r12) > + mtsrr0 r5 > + loadimm r5, MSR_KERNEL & ~(MSR_IR | MSR_DR) > + mtsrr1 r5 > + > + RFI > + > +.global kvm_return_point > +kvm_return_point: > + > + /* Jump back to lightweight entry if we're supposed to */ > + /* go back into the guest */ > + mr r5, r3 > + ld r3, HOST_STACK_RUN(r1) > + ld r4, HOST_STACK_VCPU(r1) > + bl KVMPPC_HANDLE_EXIT > + > +#if 0 /* XXX get lightweight exits back */ > + cmpwi r3, RESUME_GUEST > + bne kvm_exit_heavyweight > + > + /* put VCPU and KVM_RUN back into place and roll again! */ > + ld r3, HOST_STACK_RUN(r1) > + ld r4, HOST_STACK_VCPU(r1) > + b kvm_start_lightweight > + > +kvm_exit_heavyweight: > + /* Restore non-volatile host registers */ > + ld r14, HOST_STACK_LR(r1) > + mtlr r14 > + ld r14, HOST_STACK_R14(r1) > + ld r15, HOST_STACK_R15(r1) > + ld r16, HOST_STACK_R16(r1) > + ld r17, HOST_STACK_R17(r1) > + ld r18, HOST_STACK_R18(r1) > + ld r19, HOST_STACK_R19(r1) > + ld r20, HOST_STACK_R20(r1) > + ld r21, HOST_STACK_R21(r1) > + ld r22, HOST_STACK_R22(r1) > + ld r23, HOST_STACK_R23(r1) > + ld r24, HOST_STACK_R24(r1) > + ld r25, HOST_STACK_R25(r1) > + ld r26, HOST_STACK_R26(r1) > + ld r27, HOST_STACK_R27(r1) > + ld r28, HOST_STACK_R28(r1) > + ld r29, HOST_STACK_R29(r1) > + ld r30, HOST_STACK_R30(r1) > + ld r31, HOST_STACK_R31(r1) > + > + addi r1, r1, HOST_STACK_SIZE > +#else > + ld r4, HOST_STACK_LR(r1) > + mtlr r4 > + > + cmpwi r3, RESUME_GUEST > + bne kvm_exit_heavyweight > + > + ld r3, HOST_STACK_RUN(r1) > + ld r4, HOST_STACK_VCPU(r1) > + > + addi r1, r1, HOST_STACK_SIZE > + > + b kvm_start_entry > + > +kvm_exit_heavyweight: > + > + addi r1, r1, HOST_STACK_SIZE > +#endif > + > + blr > -- > 1.6.0.2 > > -- > To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in > the body of a message to majordomo@xxxxxxxxxxxxxxx > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe kvm-ppc" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html