On Mon, Jun 22, 2015 at 06:41:32PM +0800, Zhichao Huang wrote:
> Implement switching of the debug registers. While the number
> of registers is massive, CPUs usually don't implement them all
> (A15 has 6 breakpoints and 4 watchpoints, which gives us a total
> of 22 registers "only").
>
> Notice that, for ARMv7, if CONFIG_HAVE_HW_BREAKPOINT is set in
> the guest, debug is always actively in use (ARM_DSCR_MDBGEN set).
>
> We have to do the save/restore dance in this case, because the host
> and the guest might use their respective debug registers at any moment.

this sounds expensive, and I suggested an alternative approach in the
previous patch. In any case, measuring the impact of this on hardware
would be a great idea...

>
> If CONFIG_HAVE_HW_BREAKPOINT is not set, and if no one flagged
> the debug registers as dirty, we only save/resotre DBGDSCR.

restore

>
> Signed-off-by: Zhichao Huang <zhichao.huang@xxxxxxxxxx>
> ---
>  arch/arm/kvm/interrupts.S      |  16 +++
>  arch/arm/kvm/interrupts_head.S | 249 ++++++++++++++++++++++++++++++++++++++++-
>  2 files changed, 263 insertions(+), 2 deletions(-)
>
> diff --git a/arch/arm/kvm/interrupts.S b/arch/arm/kvm/interrupts.S
> index 79caf79..d626275 100644
> --- a/arch/arm/kvm/interrupts.S
> +++ b/arch/arm/kvm/interrupts.S
> @@ -116,6 +116,12 @@ ENTRY(__kvm_vcpu_run)
>  read_cp15_state store_to_vcpu = 0
>  write_cp15_state read_from_vcpu = 1
>
> + @ Store hardware CP14 state and load guest state
> + compute_debug_state 1f
> + bl __save_host_debug_regs
> + bl __restore_guest_debug_regs
> +
> +1:
>  @ If the host kernel has not been configured with VFPv3 support,
>  @ then it is safer if we deny guests from using it as well.
>  #ifdef CONFIG_VFPv3
> @@ -201,6 +207,16 @@ after_vfp_restore:
>  mrc p15, 0, r2, c0, c0, 5
>  mcr p15, 4, r2, c0, c0, 5
>
> + @ Store guest CP14 state and restore host state
> + skip_debug_state 1f
> + bl __save_guest_debug_regs
> + bl __restore_host_debug_regs
> + /* Clear the dirty flag for the next run, as all the state has
> +  * already been saved. Note that we nuke the whole 32bit word.
> +  * If we ever add more flags, we'll have to be more careful...
> +  */
> + clear_debug_dirty_bit
> +1:
>  @ Store guest CP15 state and restore host state
>  read_cp15_state store_to_vcpu = 1
>  write_cp15_state read_from_vcpu = 0
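
For readers following along, the dirty-word handling above boils down to
roughly the following C control flow. This is only a sketch to make the
logic explicit; debug_flags, KVM_ARM_DEBUG_DIRTY and the helpers are
placeholder names, not identifiers defined by this series:

        /* Sketch of the world-switch logic only; all names are placeholders. */
        static void switch_debug_state(struct kvm_vcpu *vcpu)
        {
                bool dirty = vcpu->arch.debug_flags & KVM_ARM_DEBUG_DIRTY;

                if (dirty) {                    /* compute_debug_state decided so */
                        save_host_debug_regs(vcpu);
                        restore_guest_debug_regs(vcpu);
                }

                run_guest(vcpu);

                if (dirty) {                    /* skip_debug_state branches otherwise */
                        save_guest_debug_regs(vcpu);
                        restore_host_debug_regs(vcpu);
                        vcpu->arch.debug_flags = 0;     /* nukes the whole word */
                }
        }
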
> diff --git a/arch/arm/kvm/interrupts_head.S b/arch/arm/kvm/interrupts_head.S
> index 5662c39..ed406be 100644
> --- a/arch/arm/kvm/interrupts_head.S
> +++ b/arch/arm/kvm/interrupts_head.S
> @@ -7,6 +7,7 @@
>  #define VCPU_USR_SP (VCPU_USR_REG(13))
>  #define VCPU_USR_LR (VCPU_USR_REG(14))
>  #define CP15_OFFSET(_cp15_reg_idx) (VCPU_CP15 + (_cp15_reg_idx * 4))
> +#define CP14_OFFSET(_cp14_reg_idx) (VCPU_CP14 + ((_cp14_reg_idx) * 4))
>
>  /*
>   * Many of these macros need to access the VCPU structure, which is always
> @@ -168,8 +169,7 @@ vcpu .req r0 @ vcpu pointer always in r0
>   * Clobbers *all* registers.
>   */
>  .macro restore_guest_regs
> - /* reset DBGDSCR to disable debug mode */
> - mov r2, #0
> + ldr r2, [vcpu, #CP14_OFFSET(cp14_DBGDSCRext)]
>  mcr p14, 0, r2, c0, c2, 2
>
>  restore_guest_regs_mode svc, #VCPU_SVC_REGS
> @@ -250,6 +250,10 @@ vcpu .req r0 @ vcpu pointer always in r0
>  save_guest_regs_mode abt, #VCPU_ABT_REGS
>  save_guest_regs_mode und, #VCPU_UND_REGS
>  save_guest_regs_mode irq, #VCPU_IRQ_REGS
> +
> + /* DBGDSCR reg */
> + mrc p14, 0, r2, c0, c1, 0
> + str r2, [vcpu, #CP14_OFFSET(cp14_DBGDSCRext)]
>  .endm
>
>  /* Reads cp15 registers from hardware and stores them in memory
> @@ -449,6 +453,231 @@ vcpu .req r0 @ vcpu pointer always in r0
>  str r5, [vcpu, #VCPU_DEBUG_FLAGS]
>  .endm
>
> +/* Assume r11/r12 in use, clobbers r2-r10 */
> +.macro cp14_read_and_push Op2 skip_num
> + cmp \skip_num, #8
> + // if (skip_num >= 8) then skip c8-c15 directly
> + bge 1f
> + adr r2, 9998f
> + add r2, r2, \skip_num, lsl #2
> + bx r2
> +1:
> + adr r2, 9999f
> + sub r3, \skip_num, #8
> + add r2, r2, r3, lsl #2
> + bx r2
> +9998:
> + mrc p14, 0, r10, c0, c15, \Op2
> + mrc p14, 0, r9, c0, c14, \Op2
> + mrc p14, 0, r8, c0, c13, \Op2
> + mrc p14, 0, r7, c0, c12, \Op2
> + mrc p14, 0, r6, c0, c11, \Op2
> + mrc p14, 0, r5, c0, c10, \Op2
> + mrc p14, 0, r4, c0, c9, \Op2
> + mrc p14, 0, r3, c0, c8, \Op2
> + push {r3-r10}

you probably don't want to do more stores to memory than required

> +9999:
> + mrc p14, 0, r10, c0, c7, \Op2
> + mrc p14, 0, r9, c0, c6, \Op2
> + mrc p14, 0, r8, c0, c5, \Op2
> + mrc p14, 0, r7, c0, c4, \Op2
> + mrc p14, 0, r6, c0, c3, \Op2
> + mrc p14, 0, r5, c0, c2, \Op2
> + mrc p14, 0, r4, c0, c1, \Op2
> + mrc p14, 0, r3, c0, c0, \Op2
> + push {r3-r10}

same

> +.endm
> +
> +/* Assume r11/r12 in use, clobbers r2-r10 */
> +.macro cp14_pop_and_write Op2 skip_num
> + cmp \skip_num, #8
> + // if (skip_num >= 8) then skip c8-c15 directly
> + bge 1f
> + adr r2, 9998f
> + add r2, r2, \skip_num, lsl #2
> + pop {r3-r10}

you probably don't want to do more loads from memory than required

> + bx r2
> +1:
> + adr r2, 9999f
> + sub r3, \skip_num, #8
> + add r2, r2, r3, lsl #2
> + pop {r3-r10}

same

> + bx r2
> +
> +9998:
> + mcr p14, 0, r10, c0, c15, \Op2
> + mcr p14, 0, r9, c0, c14, \Op2
> + mcr p14, 0, r8, c0, c13, \Op2
> + mcr p14, 0, r7, c0, c12, \Op2
> + mcr p14, 0, r6, c0, c11, \Op2
> + mcr p14, 0, r5, c0, c10, \Op2
> + mcr p14, 0, r4, c0, c9, \Op2
> + mcr p14, 0, r3, c0, c8, \Op2
> +
> + pop {r3-r10}
> +9999:
> + mcr p14, 0, r10, c0, c7, \Op2
> + mcr p14, 0, r9, c0, c6, \Op2
> + mcr p14, 0, r8, c0, c5, \Op2
> + mcr p14, 0, r7, c0, c4, \Op2
> + mcr p14, 0, r6, c0, c3, \Op2
> + mcr p14, 0, r5, c0, c2, \Op2
> + mcr p14, 0, r4, c0, c1, \Op2
> + mcr p14, 0, r3, c0, c0, \Op2
> +.endm
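
One way to address both of the remarks above (untested sketch): fold the
read and the store into each jump-table entry, so that a skipped register
never touches memory at all. Each entry becomes two instructions (8 bytes),
so the computed branch shifts by 3, and the restore side would have to
mirror the resulting stack layout:

        .macro cp14_read_and_push Op2 skip_num
                adr     r2, 1f
                add     r2, r2, \skip_num, lsl #3       @ 8 bytes per entry now
                bx      r2
        1:
                .irp    crm, c15, c14, c13, c12, c11, c10, c9, c8, c7, c6, c5, c4, c3, c2, c1, c0
                mrc     p14, 0, r3, c0, \crm, \Op2      @ read one register...
                push    {r3}                            @ ...and spill just that one
                .endr
        .endm

As a bonus, the cmp/bge and the 8-register split disappear, since the
16-entry table can be entered at any offset directly.
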
> +
> +/* Assume r11/r12 in use, clobbers r2-r3 */
> +.macro cp14_read_and_str Op2 cp14_reg0 skip_num
> + adr r3, 1f
> + add r3, r3, \skip_num, lsl #3
> + bx r3
> +1:
> + mrc p14, 0, r2, c0, c15, \Op2
> + str r2, [vcpu, #CP14_OFFSET(\cp14_reg0+15)]
> + mrc p14, 0, r2, c0, c14, \Op2
> + str r2, [vcpu, #CP14_OFFSET(\cp14_reg0+14)]
> + mrc p14, 0, r2, c0, c13, \Op2
> + str r2, [vcpu, #CP14_OFFSET(\cp14_reg0+13)]
> + mrc p14, 0, r2, c0, c12, \Op2
> + str r2, [vcpu, #CP14_OFFSET(\cp14_reg0+12)]
> + mrc p14, 0, r2, c0, c11, \Op2
> + str r2, [vcpu, #CP14_OFFSET(\cp14_reg0+11)]
> + mrc p14, 0, r2, c0, c10, \Op2
> + str r2, [vcpu, #CP14_OFFSET(\cp14_reg0+10)]
> + mrc p14, 0, r2, c0, c9, \Op2
> + str r2, [vcpu, #CP14_OFFSET(\cp14_reg0+9)]
> + mrc p14, 0, r2, c0, c8, \Op2
> + str r2, [vcpu, #CP14_OFFSET(\cp14_reg0+8)]
> + mrc p14, 0, r2, c0, c7, \Op2
> + str r2, [vcpu, #CP14_OFFSET(\cp14_reg0+7)]
> + mrc p14, 0, r2, c0, c6, \Op2
> + str r2, [vcpu, #CP14_OFFSET(\cp14_reg0+6)]
> + mrc p14, 0, r2, c0, c5, \Op2
> + str r2, [vcpu, #CP14_OFFSET(\cp14_reg0+5)]
> + mrc p14, 0, r2, c0, c4, \Op2
> + str r2, [vcpu, #CP14_OFFSET(\cp14_reg0+4)]
> + mrc p14, 0, r2, c0, c3, \Op2
> + str r2, [vcpu, #CP14_OFFSET(\cp14_reg0+3)]
> + mrc p14, 0, r2, c0, c2, \Op2
> + str r2, [vcpu, #CP14_OFFSET(\cp14_reg0+2)]
> + mrc p14, 0, r2, c0, c1, \Op2
> + str r2, [vcpu, #CP14_OFFSET(\cp14_reg0+1)]
> + mrc p14, 0, r2, c0, c0, \Op2
> + str r2, [vcpu, #CP14_OFFSET(\cp14_reg0)]
> +.endm
> +
> +/* Assume r11/r12 in use, clobbers r2-r3 */
> +.macro cp14_ldr_and_write Op2 cp14_reg0 skip_num
> + adr r3, 1f
> + add r3, r3, \skip_num, lsl #3
> + bx r3
> +1:
> + ldr r2, [vcpu, #CP14_OFFSET(\cp14_reg0+15)]
> + mcr p14, 0, r2, c0, c15, \Op2
> + ldr r2, [vcpu, #CP14_OFFSET(\cp14_reg0+14)]
> + mcr p14, 0, r2, c0, c14, \Op2
> + ldr r2, [vcpu, #CP14_OFFSET(\cp14_reg0+13)]
> + mcr p14, 0, r2, c0, c13, \Op2
> + ldr r2, [vcpu, #CP14_OFFSET(\cp14_reg0+12)]
> + mcr p14, 0, r2, c0, c12, \Op2
> + ldr r2, [vcpu, #CP14_OFFSET(\cp14_reg0+11)]
> + mcr p14, 0, r2, c0, c11, \Op2
> + ldr r2, [vcpu, #CP14_OFFSET(\cp14_reg0+10)]
> + mcr p14, 0, r2, c0, c10, \Op2
> + ldr r2, [vcpu, #CP14_OFFSET(\cp14_reg0+9)]
> + mcr p14, 0, r2, c0, c9, \Op2
> + ldr r2, [vcpu, #CP14_OFFSET(\cp14_reg0+8)]
> + mcr p14, 0, r2, c0, c8, \Op2
> + ldr r2, [vcpu, #CP14_OFFSET(\cp14_reg0+7)]
> + mcr p14, 0, r2, c0, c7, \Op2
> + ldr r2, [vcpu, #CP14_OFFSET(\cp14_reg0+6)]
> + mcr p14, 0, r2, c0, c6, \Op2
> + ldr r2, [vcpu, #CP14_OFFSET(\cp14_reg0+5)]
> + mcr p14, 0, r2, c0, c5, \Op2
> + ldr r2, [vcpu, #CP14_OFFSET(\cp14_reg0+4)]
> + mcr p14, 0, r2, c0, c4, \Op2
> + ldr r2, [vcpu, #CP14_OFFSET(\cp14_reg0+3)]
> + mcr p14, 0, r2, c0, c3, \Op2
> + ldr r2, [vcpu, #CP14_OFFSET(\cp14_reg0+2)]
> + mcr p14, 0, r2, c0, c2, \Op2
> + ldr r2, [vcpu, #CP14_OFFSET(\cp14_reg0+1)]
> + mcr p14, 0, r2, c0, c1, \Op2
> + ldr r2, [vcpu, #CP14_OFFSET(\cp14_reg0)]
> + mcr p14, 0, r2, c0, c0, \Op2
> +.endm

can you not find some way of unifying cp14_pop_and_write with
cp14_ldr_and_write and cp14_read_and_push with cp14_read_and_str?
Probably having two separate structs for the debug state on the vcpu
struct, for both the guest and the host state, is one possible way of
doing so.
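
To sketch what that unification could look like (untested; it assumes the
host cp14 state moves into its own save area so that both paths become
plain base-plus-offset accesses, and the base/offset arguments here are
hypothetical, not part of this patch):

        .macro cp14_save base, Op2, offset, skip_num
                adr     r3, 1f
                add     r3, r3, \skip_num, lsl #3       @ 8 bytes per entry
                bx      r3
        1:
                .irp    idx, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
                mrc     p14, 0, r2, c0, c\idx, \Op2
                str     r2, [\base, #(\offset + 4 * \idx)]
                .endr
        .endm

The guest path would pass vcpu plus the CP14_OFFSET of the first register,
the host path a per-CPU save area, and a matching ldr/mcr cp14_restore
would then replace both of the write-side macros.
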
> +
> +/* Get the number of BRPs and WRPs. Saved in r11/r12 */
> +.macro read_hw_dbg_num
> + mrc p14, 0, r2, c0, c0, 0
> + ubfx r11, r2, #24, #4
> + add r11, r11, #1 // Extract BRPs
> + ubfx r12, r2, #28, #4
> + add r12, r12, #1 // Extract WRPs
> + mov r2, #16
> + sub r11, r2, r11 // How many BPs to skip
> + sub r12, r2, r12 // How many WPs to skip
> +.endm
> +
> +/* Reads cp14 registers from hardware.

You have a lot of multi-line comments in these patches which don't start
with a separate '/*' line, as dictated by the Linux kernel coding style.
So far, I've ignored this, but please fix all these throughout the series
when you respin.

> + * Writes cp14 registers in-order to the stack.
> + *
> + * Assumes vcpu pointer in vcpu reg
> + *
> + * Clobbers r2-r12
> + */
> +.macro save_host_debug_regs
> + read_hw_dbg_num
> + cp14_read_and_push #4, r11 @ DBGBVR
> + cp14_read_and_push #5, r11 @ DBGBCR
> + cp14_read_and_push #6, r12 @ DBGWVR
> + cp14_read_and_push #7, r12 @ DBGWCR
> +.endm
> +
> +/* Reads cp14 registers from hardware.
> + * Writes cp14 registers in-order to the VCPU struct pointed to by vcpup.
> + *
> + * Assumes vcpu pointer in vcpu reg
> + *
> + * Clobbers r2-r12
> + */
> +.macro save_guest_debug_regs
> + read_hw_dbg_num
> + cp14_read_and_str #4, cp14_DBGBVR0, r11

why do you need the hash before the op2 field?

> + cp14_read_and_str #5, cp14_DBGBCR0, r11
> + cp14_read_and_str #6, cp14_DBGWVR0, r12
> + cp14_read_and_str #7, cp14_DBGWCR0, r12
> +.endm
> +
> +/* Reads cp14 registers in-order from the stack.
> + * Writes cp14 registers to hardware.
> + *
> + * Assumes vcpu pointer in vcpu reg
> + *
> + * Clobbers r2-r12
> + */
> +.macro restore_host_debug_regs
> + read_hw_dbg_num
> + cp14_pop_and_write #4, r11 @ DBGBVR
> + cp14_pop_and_write #5, r11 @ DBGBCR
> + cp14_pop_and_write #6, r12 @ DBGWVR
> + cp14_pop_and_write #7, r12 @ DBGWCR
> +.endm
> +
> +/* Reads cp14 registers in-order from the VCPU struct pointed to by vcpup.
> + * Writes cp14 registers to hardware.
> + *
> + * Assumes vcpu pointer in vcpu reg
> + *
> + * Clobbers r2-r12
> + */
> +.macro restore_guest_debug_regs
> + read_hw_dbg_num
> + cp14_ldr_and_write #4, cp14_DBGBVR0, r11
> + cp14_ldr_and_write #5, cp14_DBGBCR0, r11
> + cp14_ldr_and_write #6, cp14_DBGWVR0, r12
> + cp14_ldr_and_write #7, cp14_DBGWCR0, r12
> +.endm
> +
>  /*
>   * Save the VGIC CPU state into memory
>   *
> @@ -684,3 +913,19 @@ ARM_BE8(rev r6, r6 )
>  .macro load_vcpu
>  mrc p15, 4, vcpu, c13, c0, 2 @ HTPIDR
>  .endm
> +
> +__save_host_debug_regs:
> + save_host_debug_regs
> + bx lr
> +
> +__save_guest_debug_regs:
> + save_guest_debug_regs
> + bx lr
> +
> +__restore_host_debug_regs:
> + restore_host_debug_regs
> + bx lr
> +
> +__restore_guest_debug_regs:
> + restore_guest_debug_regs
> + bx lr
> --
> 1.7.12.4
>

Thanks,
-Christoffer
_______________________________________________
kvmarm mailing list
kvmarm@xxxxxxxxxxxxxxxxxxxxx
https://lists.cs.columbia.edu/mailman/listinfo/kvmarm