On 28.06.2012, at 08:17, Bharat Bhushan wrote: > This patch adds the watchdog emulation in KVM. The watchdog > emulation is enabled by KVM_ENABLE_CAP(KVM_CAP_PPC_WDT) ioctl. > The kernel timer are used for watchdog emulation and emulates > h/w watchdog state machine. On watchdog timer expiry, it exit to QEMU > if TCR.WRC is non ZERO. QEMU can reset/shutdown etc depending upon how > it is configured. > > Signed-off-by: Liu Yu <yu.liu@xxxxxxxxxxxxx> > Signed-off-by: Scott Wood <scottwood@xxxxxxxxxxxxx> > Signed-off-by: Bharat Bhushan <bharat.bhushan@xxxxxxxxxxxxx> > --- > arch/powerpc/include/asm/kvm_host.h | 2 + > arch/powerpc/include/asm/kvm_ppc.h | 3 + > arch/powerpc/include/asm/reg_booke.h | 2 + > arch/powerpc/kvm/44x.c | 1 + > arch/powerpc/kvm/booke.c | 125 ++++++++++++++++++++++++++++++++++ > arch/powerpc/kvm/booke_emulate.c | 6 ++- > arch/powerpc/kvm/powerpc.c | 25 ++++++- > include/linux/kvm.h | 2 + > 8 files changed, 162 insertions(+), 4 deletions(-) > > diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h > index 50ea12f..a9e5aed 100644 > --- a/arch/powerpc/include/asm/kvm_host.h > +++ b/arch/powerpc/include/asm/kvm_host.h > @@ -467,6 +467,7 @@ struct kvm_vcpu_arch { > ulong fault_esr; > ulong queued_dear; > ulong queued_esr; > + struct timer_list wdt_timer; > u32 tlbcfg[4]; > u32 mmucfg; > u32 epr; > @@ -482,6 +483,7 @@ struct kvm_vcpu_arch { > u8 osi_needed; > u8 osi_enabled; > u8 papr_enabled; > + u8 watchdog_enable; > u8 sane; > u8 cpu_type; > u8 hcall_needed; > diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h > index 0124937..e5cf4b9 100644 > --- a/arch/powerpc/include/asm/kvm_ppc.h > +++ b/arch/powerpc/include/asm/kvm_ppc.h > @@ -67,6 +67,7 @@ extern int kvmppc_emulate_mmio(struct kvm_run *run, struct kvm_vcpu *vcpu); > extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu); > extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb); > extern void kvmppc_decrementer_func(unsigned 
long data); > +extern void kvmppc_watchdog_func(unsigned long data); > extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu); > > /* Core-specific hooks */ > @@ -104,6 +105,8 @@ extern void kvmppc_core_queue_external(struct kvm_vcpu *vcpu, > struct kvm_interrupt *irq); > extern void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu, > struct kvm_interrupt *irq); > +extern void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu); > +extern void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu); > > extern int kvmppc_core_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu, > unsigned int op, int *advance); > diff --git a/arch/powerpc/include/asm/reg_booke.h b/arch/powerpc/include/asm/reg_booke.h > index 2d916c4..7efb3f5 100644 > --- a/arch/powerpc/include/asm/reg_booke.h > +++ b/arch/powerpc/include/asm/reg_booke.h > @@ -538,6 +538,8 @@ > #define FP_2_21 3 /* 2^21 clocks */ > #define TCR_FIE 0x00800000 /* FIT Interrupt Enable */ > #define TCR_ARE 0x00400000 /* Auto Reload Enable */ > +#define TCR_GET_FSL_WP(tcr) ((((tcr) & 0xC0000000) >> 30) | \ > + (((tcr) & 0x1E0000) >> 15)) > > /* Bit definitions for the TSR. 
*/ > #define TSR_ENW 0x80000000 /* Enable Next Watchdog */ > diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c > index 50e7dbc..a213aba 100644 > --- a/arch/powerpc/kvm/44x.c > +++ b/arch/powerpc/kvm/44x.c > @@ -28,6 +28,7 @@ > #include <asm/kvm_44x.h> > #include <asm/kvm_ppc.h> > > +#include "booke.h" > #include "44x_tlb.h" > #include "booke.h" > > diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c > index d25a097..ad30f75 100644 > --- a/arch/powerpc/kvm/booke.c > +++ b/arch/powerpc/kvm/booke.c > @@ -206,6 +206,16 @@ void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu, > clear_bit(BOOKE_IRQPRIO_EXTERNAL_LEVEL, &vcpu->arch.pending_exceptions); > } > > +void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu) > +{ > + kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_WATCHDOG); > +} > + > +void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu) > +{ > + clear_bit(BOOKE_IRQPRIO_WATCHDOG, &vcpu->arch.pending_exceptions); > +} > + > static void set_guest_srr(struct kvm_vcpu *vcpu, unsigned long srr0, u32 srr1) > { > #ifdef CONFIG_KVM_BOOKE_HV > @@ -325,6 +335,7 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, > msr_mask = MSR_CE | MSR_ME | MSR_DE; > int_class = INT_CLASS_NONCRIT; > break; > + case BOOKE_IRQPRIO_WATCHDOG: > case BOOKE_IRQPRIO_CRITICAL: > case BOOKE_IRQPRIO_DBELL_CRIT: > allowed = vcpu->arch.shared->msr & MSR_CE; > @@ -404,12 +415,98 @@ static int kvmppc_booke_irqprio_deliver(struct kvm_vcpu *vcpu, > return allowed; > } > > +/* > + * The timer system can almost deal with LONG_MAX timeouts, except that > + * when you get very close to LONG_MAX, the slack added can cause overflow. > + * > + * LONG_MAX/2 is a conservative threshold, but it should be adequate for > + * any realistic use. > + */ > +#define MAX_TIMEOUT (LONG_MAX/2) Should this really be in kvm code? > + > +/* > + * Return the number of jiffies until the next timeout. 
If the timeout is > + * longer than MAX_TIMEOUT, we return MAX_TIMEOUT instead.
> +} > + > +void kvmppc_watchdog_func(unsigned long data) > +{ > + struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data; > + u32 tsr, new_tsr; > + int final; > + > + do { > + new_tsr = tsr = vcpu->arch.tsr; > + final = 0; > + > + /* Time out event */ > + if (tsr & TSR_ENW) { > + if (tsr & TSR_WIS) { > + new_tsr = (tsr & ~TCR_WRC_MASK) | > + (vcpu->arch.tcr & TCR_WRC_MASK); > + vcpu->arch.tcr &= ~TCR_WRC_MASK; > + final = 1; > + } else { > + new_tsr = tsr | TSR_WIS; > + } > + } else { > + new_tsr = tsr | TSR_ENW; > + } > + } while (cmpxchg(&vcpu->arch.tsr, tsr, new_tsr) != tsr); > + > + if (new_tsr & (TSR_WIS | TCR_WRC_MASK)) { > + smp_wmb(); > + kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu); > + kvm_vcpu_kick(vcpu); > + } > + > + /* > + * Avoid getting a storm of timers if the guest sets > + * the period very short. We'll restart it if anything > + * changes. > + */ > + if (!final) > + arm_next_watchdog(vcpu); Mind to explain this part a bit further? > +} > static void update_timer_ints(struct kvm_vcpu *vcpu) > { > if ((vcpu->arch.tcr & TCR_DIE) && (vcpu->arch.tsr & TSR_DIS)) > kvmppc_core_queue_dec(vcpu); > else > kvmppc_core_dequeue_dec(vcpu); > + > + if ((vcpu->arch.tcr & TCR_WIE) && (vcpu->arch.tsr & TSR_WIS)) > + kvmppc_core_queue_watchdog(vcpu); > + else > + kvmppc_core_dequeue_watchdog(vcpu); > } > > static void kvmppc_core_check_exceptions(struct kvm_vcpu *vcpu) > @@ -516,6 +613,12 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) > goto out; > } > > + if (vcpu->arch.tsr & TCR_WRC_MASK) { > + kvm_run->exit_reason = KVM_EXIT_WDT; > + ret = 0; > + goto out; > + } > + > kvm_guest_enter(); > > #ifdef CONFIG_PPC_FPU > @@ -977,6 +1080,10 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu, > kvmppc_account_exit(vcpu, SIGNAL_EXITS); > } > } > + if (vcpu->arch.tsr & TCR_WRC_MASK) { > + run->exit_reason = KVM_EXIT_WDT; > + r = RESUME_HOST | (r & RESUME_FLAG_NV); > + } > > return r; > } > @@ -1106,7 +1213,14 @@ static int 
set_sregs_base(struct kvm_vcpu *vcpu, > } > > if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) { > + u32 old_tsr = vcpu->arch.tsr; > + > vcpu->arch.tsr = sregs->u.e.tsr; > + > + if ((old_tsr ^ vcpu->arch.tsr) & > + (TSR_ENW | TSR_WIS | TCR_WRC_MASK)) > + arm_next_watchdog(vcpu); Why isn't this one guarded by vcpu->arch.watchdog_enable? > + > update_timer_ints(vcpu); > } > > @@ -1267,6 +1381,8 @@ void kvmppc_core_commit_memory_region(struct kvm *kvm, > void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr) > { > vcpu->arch.tcr = new_tcr; > + if (vcpu->arch.watchdog_enable) > + arm_next_watchdog(vcpu); > update_timer_ints(vcpu); > } > > @@ -1281,6 +1397,15 @@ void kvmppc_set_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits) > void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits) > { > clear_bits(tsr_bits, &vcpu->arch.tsr); > + > + /* > + * We may have stopped the watchdog due to > + * being stuck on final expiration. > + */ > + if (vcpu->arch.watchdog_enable && (tsr_bits & (TSR_ENW | > + TSR_WIS | TCR_WRC_MASK))) > + arm_next_watchdog(vcpu); > + > update_timer_ints(vcpu); > } > > diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c > index 12834bb..cc94d42 100644 > --- a/arch/powerpc/kvm/booke_emulate.c > +++ b/arch/powerpc/kvm/booke_emulate.c > @@ -145,7 +145,11 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val) > kvmppc_clr_tsr_bits(vcpu, spr_val); > break; > case SPRN_TCR: > - kvmppc_set_tcr(vcpu, spr_val); > + /* WRC can only be programmed when WRC=0 */ > + if (TCR_WRC_MASK & vcpu->arch.tcr) Please reverse the order. > + spr_val &= ~TCR_WRC_MASK; > + kvmppc_set_tcr(vcpu, > + spr_val | (TCR_WRC_MASK & vcpu->arch.tcr)); In fact, what you're trying to do here is keep TCR_WRC always on when it was enabled once. So all you need is the OR here. No need for the mask above. 
> break; > > case SPRN_DECAR: > diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c > index 87f4dc8..35a1ff3 100644 > --- a/arch/powerpc/kvm/powerpc.c > +++ b/arch/powerpc/kvm/powerpc.c > @@ -38,9 +38,15 @@ > > int kvm_arch_vcpu_runnable(struct kvm_vcpu *v) > { > - return !(v->arch.shared->msr & MSR_WE) || > - !!(v->arch.pending_exceptions) || > - v->requests; > + bool ret = !(v->arch.shared->msr & MSR_WE) || > + !!(v->arch.pending_exceptions) || > + v->requests; > + > +#ifdef CONFIG_BOOKE > + ret = ret || (v->arch.tsr & TCR_WRC_MASK); Please make this a callback. In a header if you think it's performance critical, but I don't want to see #ifdef CONFIG_BOOKE too often in powerpc.c. > +#endif > + > + return ret; > } > > int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu) > @@ -237,6 +243,7 @@ int kvm_dev_ioctl_check_extension(long ext) > case KVM_CAP_PPC_GET_PVINFO: > #if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC) > case KVM_CAP_SW_TLB: > + case KVM_CAP_PPC_WDT: > #endif > r = 1; > break; > @@ -393,6 +400,10 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu) > void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) > { > kvmppc_mmu_destroy(vcpu); > +#ifdef CONFIG_BOOKE > + if (vcpu->arch.watchdog_enable) > + del_timer(&vcpu->arch.wdt_timer); > +#endif > } > > void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) > @@ -637,6 +648,14 @@ static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, > r = 0; > vcpu->arch.papr_enabled = true; > break; > +#ifdef CONFIG_BOOKE > + case KVM_CAP_PPC_WDT: > + r = 0; > + vcpu->arch.watchdog_enable = true; > + setup_timer(&vcpu->arch.wdt_timer, kvmppc_watchdog_func, > + (unsigned long)vcpu); Shouldn't we guard against user space calling this twice? Alex -- To unsubscribe from this list: send the line "unsubscribe kvm" in the body of a message to majordomo@xxxxxxxxxxxxxxx More majordomo info at http://vger.kernel.org/majordomo-info.html