On 07/31/2012 06:50 AM, Alexander Graf wrote: > > On 25.07.2012, at 05:49, Bharat Bhushan wrote: > >> This patch adds the watchdog emulation in KVM. The watchdog >> emulation is enabled by KVM_ENABLE_CAP(KVM_CAP_PPC_BOOKE_WATCHDOG) ioctl. >> The kernel timer are used for watchdog emulation and emulates >> h/w watchdog state machine. On watchdog timer expiry, it exit to QEMU >> if TCR.WRC is non ZERO. QEMU can reset/shutdown etc depending upon how >> it is configured. >> >> Signed-off-by: Liu Yu <yu.liu@xxxxxxxxxxxxx> >> Signed-off-by: Scott Wood <scottwood@xxxxxxxxxxxxx> >> [bharat.bhushan@xxxxxxxxxxxxx: reworked patch] >> Signed-off-by: Bharat Bhushan <bharat.bhushan@xxxxxxxxxxxxx> >> --- >> v6: >> - Added kvmppc_subarch_vcpu_unit/uninit() for subarch specific initialization >> - Using spin_lock_irqsave() >> - Using del_timer_sync(). >> >> v5: >> - Checking that TSR_ENW/TSR_WIS are still set on KVM_EXIT_WATCHDOG. >> - Moved watchdog_next_timeout() in lock. >> >> arch/powerpc/include/asm/kvm_host.h | 3 + >> arch/powerpc/include/asm/kvm_ppc.h | 4 + >> arch/powerpc/include/asm/reg_booke.h | 7 ++ >> arch/powerpc/kvm/book3s.c | 9 ++ >> arch/powerpc/kvm/booke.c | 156 ++++++++++++++++++++++++++++++++++ >> arch/powerpc/kvm/booke_emulate.c | 8 ++ >> arch/powerpc/kvm/powerpc.c | 14 +++- >> include/linux/kvm.h | 2 + >> include/linux/kvm_host.h | 1 + >> 9 files changed, 202 insertions(+), 2 deletions(-) >> >> diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h >> index 50ea12f..43cac48 100644 >> --- a/arch/powerpc/include/asm/kvm_host.h >> +++ b/arch/powerpc/include/asm/kvm_host.h >> @@ -467,6 +467,8 @@ struct kvm_vcpu_arch { >> ulong fault_esr; >> ulong queued_dear; >> ulong queued_esr; >> + spinlock_t wdt_lock; >> + struct timer_list wdt_timer; >> u32 tlbcfg[4]; >> u32 mmucfg; >> u32 epr; >> @@ -482,6 +484,7 @@ struct kvm_vcpu_arch { >> u8 osi_needed; >> u8 osi_enabled; >> u8 papr_enabled; >> + u8 watchdog_enabled; >> u8 sane; >> u8 cpu_type; >> u8 hcall_needed; >> diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h >> index 0124937..823d563 100644 >> --- a/arch/powerpc/include/asm/kvm_ppc.h >> +++ b/arch/powerpc/include/asm/kvm_ppc.h >> @@ -68,6 +68,8 @@ extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu); >> extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb); >> extern void kvmppc_decrementer_func(unsigned long data); >> extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu); >> +extern int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu); >> +extern void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu); >> >> /* Core-specific hooks */ >> >> @@ -104,6 +106,8 @@ extern void kvmppc_core_queue_external(struct kvm_vcpu *vcpu, >> struct kvm_interrupt *irq); >> extern void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu, >> struct kvm_interrupt *irq); >> +extern void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu); >> +extern void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu); >> >> extern int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, >> unsigned int op, int *advance); >> diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h >> index 2d916c4..e07e6af 100644 >> --- a/arch/powerpc/include/asm/reg_booke.h >> +++ b/arch/powerpc/include/asm/reg_booke.h >> @@ -539,6 +539,13 @@ >> #define TCR_FIE 0x00800000 /* FIT Interrupt Enable */ >> #define TCR_ARE 0x00400000 /* Auto Reload Enable */ >> >> +#ifdef CONFIG_E500 >> +#define TCR_GET_WP(tcr) ((((tcr) & 0xC0000000) >> 30) | \ >> + (((tcr) & 0x1E0000) >> 15)) >> +#else >> +#define TCR_GET_WP(tcr) (((tcr) & 0xC0000000) >> 30) >> +#endif >> + >> /* Bit definitions for the TSR. */ >> #define TSR_ENW 0x80000000 /* Enable Next Watchdog */ >> #define TSR_WIS 0x40000000 /* WDT Interrupt Status */ >> diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c >> index 3f2a836..e946665 100644 >> --- a/arch/powerpc/kvm/book3s.c >> +++ b/arch/powerpc/kvm/book3s.c >> @@ -411,6 +411,15 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) >> return 0; >> } >> >> +int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu) >> +{ >> + return 0; >> +} >> + >> +void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu) >> +{ >> +} >> + >> int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) >> { >> int i; >> diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c >> index d25a097..eadd86c 100644 >> --- a/arch/powerpc/kvm/booke.c >> +++ b/arch/powerpc/kvm/booke.c >> @@ -206,6 +206,16 @@ void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu, >> clear_bit(BOOKE_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions); >> } >> >> +void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu) >> +{ >> + kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_WATCHDOG); >> +} >> + >> +void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu) >> +{ >> + clear_bit(BOOKE_IRQPRIO_WATCHDOG, &vcpu->arch.pending_exceptions); >> +} >> + > > These should be static, since we only need them in booke specific code. > >> static void set_guest_srr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1) >> { >> #ifdef CONFIG_KVM_BOOKE_HV >> @@ -325,6 +335,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, >> msr_mask = MSR_CE | MSR_ME | MSR_DE; >> int_class = INT_CLASS_NONCRIT; >> break; >> + case BOOKE_IRQPRIO_WATCHDOG: >> case BOOKE_IRQPRIO_CRITICAL: >> case BOOKE_IRQPRIO_DBELL_CRIT: >> allowed = vcpu->arch.shared->msr & MSR_CE; >> @@ -404,12 +415,114 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, >> return allowed; >> } >> >> +/* >> + * Return the number of jiffies until the next timeout. If the timeout is >> + * longer than the NEXT_TIMER_MAX_DELTA, then return NEXT_TIMER_MAX_DELTA >> + * because the larger value can break the timer APIs. >> + */ >> +static unsigned long watchdog_next_timeout(struct kvm_vcpu *vcpu) >> +{ >> + u64 tb, wdt_tb, wdt_ticks = 0; >> + u64 nr_jiffies = 0; >> + u32 period = TCR_GET_WP(vcpu->arch.tcr); >> + >> + wdt_tb = 1ULL << (63 - period); >> + tb = get_tb(); >> + /* >> + * The watchdog timeout will hapeen when TB bit corresponding >> + * to watchdog will toggle from 0 to 1. >> + */ >> + if (tb & wdt_tb) >> + wdt_ticks = wdt_tb; >> + >> + wdt_ticks += wdt_tb - (tb & (wdt_tb - 1)); >> + >> + /* Convert timebase ticks to jiffies */ >> + nr_jiffies = wdt_ticks; >> + >> + if (do_div(nr_jiffies, tb_ticks_per_jiffy)) >> + nr_jiffies++; >> + >> + return min_t(unsigned long long, nr_jiffies, NEXT_TIMER_MAX_DELTA); >> +} >> + >> +static void arm_next_watchdog(struct kvm_vcpu *vcpu) >> +{ >> + unsigned long nr_jiffies; >> + unsigned long flags; >> + >> + spin_lock_irqsave(&vcpu->arch.wdt_lock, flags); >> + nr_jiffies = watchdog_next_timeout(vcpu); >> + /* >> + * If the number of jiffies of watchdog timer >= NEXT_TIMER_MAX_DELTA >> + * then do not run the watchdog timer as this can break timer APIs. >> + */ >> + if (nr_jiffies < NEXT_TIMER_MAX_DELTA) >> + mod_timer(&vcpu->arch.wdt_timer, jiffies + nr_jiffies); >> + else >> + del_timer(&vcpu->arch.wdt_timer); >> + spin_unlock_irqrestore(&vcpu->arch.wdt_lock, flags); >> +} >> + >> +void kvmppc_watchdog_func(unsigned long data) >> +{ >> + struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data; >> + u32 tsr, new_tsr; >> + int final; >> + >> + do { >> + new_tsr = tsr = vcpu->arch.tsr; >> + final = 0; >> + >> + /* Time out event */ >> + if (tsr & TSR_ENW) { >> + if (tsr & TSR_WIS) >> + final = 1; >> + else >> + new_tsr = tsr | TSR_WIS; >> + } else { >> + new_tsr = tsr | TSR_ENW; >> + } >> + } while (cmpxchg(&vcpu->arch.tsr, tsr, new_tsr) != tsr); >> + >> + if (new_tsr & TSR_WIS) { >> + smp_wmb(); >> + kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); >> + kvm_vcpu_kick(vcpu); >> + } >> + >> + /* >> + * If this is final watchdog expiry and some action is required >> + * then exit to userspace. >> + */ >> + if (final && (vcpu->arch.tcr & TCR_WRC_MASK) && >> + vcpu->arch.watchdog_enabled) { >> + smp_wmb(); >> + kvm_make_request(KVM_REQ_WATCHDOG, vcpu); >> + kvm_vcpu_kick(vcpu); >> + } >> + >> + /* >> + * Stop running the watchdog timer after final expiration to >> + * prevent the host from being flooded with timers if the >> + * guest sets a short period. >> + * Timers will resume when TSR/TCR is updated next time. >> + */ >> + if (!final) >> + arm_next_watchdog(vcpu); >> +} >> + >> static void update_timer_ints(struct kvm_vcpu *vcpu) >> { >> if ((vcpu->arch.tcr & TCR_DIE) && (vcpu->arch.tsr & TSR_DIS)) >> kvmppc_core_queue_dec(vcpu); >> else >> kvmppc_core_dequeue_dec(vcpu); >> + >> + if ((vcpu->arch.tcr & TCR_WIE) && (vcpu->arch.tsr & TSR_WIS)) >> + kvmppc_core_queue_watchdog(vcpu); >> + else >> + kvmppc_core_dequeue_watchdog(vcpu); >> } >> >> static void kvmppc_core_check_exceptions(struct kvm_vcpu *vcpu) >> @@ -516,6 +629,13 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) >> goto out; >> } >> >> + if (kvm_check_request(KVM_REQ_WATCHDOG, vcpu) && >> + (vcpu->arch.tsr & (TSR_ENW | TSR_WIS))) { >> + kvm_run->exit_reason = KVM_EXIT_WATCHDOG; >> + ret = 0; >> + goto out; >> + } > > I think we want to push this into a generic function that checks for requests. Check out > > https://github.com/agraf/linux-2.6/commit/e8fe975b59b24777c6314addc9cea2b959454738 > > for what I did there, namely kvmppc_check_requests. If your case we obviously want a return value. I don't mind which patch goes in first, so if you add a nice and generic version first, I can easily base my mmu notifier patches on top of yours. > >> + >> kvm_guest_enter(); >> >> #ifdef CONFIG_PPC_FPU >> @@ -978,6 +1098,12 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, >> } >> } >> >> + if (kvm_check_request(KVM_REQ_WATCHDOG, vcpu) && >> + (vcpu->arch.tsr & (TSR_ENW | TSR_WIS))) { >> + run->exit_reason = KVM_EXIT_WATCHDOG; >> + r = RESUME_HOST | (r & RESUME_FLAG_NV); >> + } >> + >> return r; >> } >> >> @@ -1011,6 +1137,21 @@ int kvm_arch_vcpu_setup(struct kvm_vcpu *vcpu) >> return r; >> } >> >> +int kvmppc_subarch_vcpu_init(struct kvm_vcpu *vcpu) >> +{ >> + /* setup watchdog timer once */ >> + spin_lock_init(&vcpu->arch.wdt_lock); >> + setup_timer(&vcpu->arch.wdt_timer, kvmppc_watchdog_func, >> + (unsigned long)vcpu); >> + >> + return 0; >> +} >> + >> +void kvmppc_subarch_vcpu_uninit(struct kvm_vcpu *vcpu) >> +{ >> + del_timer_sync(&vcpu->arch.wdt_timer); >> +} >> + >> int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) >> { >> int i; >> @@ -1106,7 +1247,13 @@ static int set_sregs_base(struct kvm_vcpu *vcpu, >> } >> >> if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) { >> + u32 old_tsr = vcpu->arch.tsr; >> + >> vcpu->arch.tsr = sregs->u.e.tsr; >> + >> + if ((old_tsr ^ vcpu->arch.tsr) & (TSR_ENW | TSR_WIS)) >> + arm_next_watchdog(vcpu); > > Just curious. Is this potentially racy with the watchdog timer, since we're not accessing tsr atomically. Userspace updating TSR is racy (not just with the watchdog). That's why userspace needs to set a special flag for it to happen. It's meant for reset, migration, etc. rather than normal runtime operations. This could be a problem if we want QEMU to clear the watchdog when resuming from debug halt -- we don't want to miss a decrementer that happens around the same time. Maybe we should have separate virtual registers (via one-reg) for each timer source. The only new raciness I can see is if we race with a watchdog timer that determines itself to be the final expiration, and thus doesn't run arm_next_watchdog itself. In practice, I think you'd need three watchdog timers to go off between during the above code, so it's not very realistic, but I suppose you could use a cmpxchg to avoid it. -Scott -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html