On 07/09/2012 12:34 PM, Bharat Bhushan wrote:
> This patch adds the watchdog emulation in KVM. The watchdog
> emulation is enabled by KVM_ENABLE_CAP(KVM_CAP_PPC_WDT) ioctl.
> The kernel timer are used for watchdog emulation and emulates
> h/w watchdog state machine. On watchdog timer expiry, it exit to QEMU
> if TCR.WRC is non ZERO. QEMU can reset/shutdown etc depending upon how
> it is configured.
>
> Signed-off-by: Liu Yu <yu.liu@xxxxxxxxxxxxx>
> Signed-off-by: Scott Wood <scottwood@xxxxxxxxxxxxx>
> Signed-off-by: Bharat Bhushan <bharat.bhushan@xxxxxxxxxxxxx>
> ---
> -v2:
> - Addressed the review comments
>
> arch/powerpc/include/asm/kvm_book3s.h | 5 ++
> arch/powerpc/include/asm/kvm_booke.h | 5 ++
> arch/powerpc/include/asm/kvm_host.h | 3 +
> arch/powerpc/include/asm/kvm_ppc.h | 3 +
> arch/powerpc/include/asm/reg_booke.h | 7 ++
> arch/powerpc/kvm/44x.c | 1 +
> arch/powerpc/kvm/booke.c | 128
+++++++++++++++++++++++++++++++++
> arch/powerpc/kvm/booke_emulate.c | 8 ++
> arch/powerpc/kvm/powerpc.c | 31 +++++++-
> include/linux/kvm.h | 2 +
> 10 files changed, 189 insertions(+), 4 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/kvm_book3s.h
b/arch/powerpc/include/asm/kvm_book3s.h
> index f0e0c6a..7bbc6cd 100644
> --- a/arch/powerpc/include/asm/kvm_book3s.h
> +++ b/arch/powerpc/include/asm/kvm_book3s.h
> @@ -446,6 +446,11 @@ static inline bool
kvmppc_critical_section(struct kvm_vcpu *vcpu)
> }
> #endif
>
> +static inline u32 kvmppc_get_tsr_wrc(struct kvm_vcpu *vcpu)
> +{
> + return 0;
> +}
> +
> /* Magic register values loaded into r3 and r4 before the 'sc' assembly
> * instruction for the OSI hypercalls */
> #define OSI_SC_MAGIC_R3 0x113724FA
> diff --git a/arch/powerpc/include/asm/kvm_booke.h
b/arch/powerpc/include/asm/kvm_booke.h
> index b7cd335..e5b86c1 100644
> --- a/arch/powerpc/include/asm/kvm_booke.h
> +++ b/arch/powerpc/include/asm/kvm_booke.h
> @@ -100,4 +100,9 @@ static inline ulong kvmppc_get_msr(struct
kvm_vcpu *vcpu)
> {
> return vcpu->arch.shared->msr;
> }
> +
> +static inline u32 kvmppc_get_tsr_wrc(struct kvm_vcpu *vcpu)
> +{
> + return vcpu->arch.tsr & TCR_WRC_MASK;
> +}
> #endif /* __ASM_KVM_BOOKE_H__ */
> diff --git a/arch/powerpc/include/asm/kvm_host.h
b/arch/powerpc/include/asm/kvm_host.h
> index 50ea12f..01047f4 100644
> --- a/arch/powerpc/include/asm/kvm_host.h
> +++ b/arch/powerpc/include/asm/kvm_host.h
> @@ -467,6 +467,8 @@ struct kvm_vcpu_arch {
> ulong fault_esr;
> ulong queued_dear;
> ulong queued_esr;
> + spinlock_t wdt_lock;
> + struct timer_list wdt_timer;
These should be booke specific. If you can't fit them anywhere else, at
least do an #ifdef for booke kvm around them, so we know where they belong.
> u32 tlbcfg[4];
> u32 mmucfg;
> u32 epr;
> @@ -482,6 +484,7 @@ struct kvm_vcpu_arch {
> u8 osi_needed;
> u8 osi_enabled;
> u8 papr_enabled;
> + u8 watchdog_enable;
> u8 sane;
> u8 cpu_type;
> u8 hcall_needed;
> diff --git a/arch/powerpc/include/asm/kvm_ppc.h
b/arch/powerpc/include/asm/kvm_ppc.h
> index 0124937..e5cf4b9 100644
> --- a/arch/powerpc/include/asm/kvm_ppc.h
> +++ b/arch/powerpc/include/asm/kvm_ppc.h
> @@ -67,6 +67,7 @@ extern int kvmppc_emulate_mmio(struct kvm_run *run,
struct kvm_vcpu *vcpu);
> extern void kvmppc_emulate_dec(struct kvm_vcpu *vcpu);
> extern u32 kvmppc_get_dec(struct kvm_vcpu *vcpu, u64 tb);
> extern void kvmppc_decrementer_func(unsigned long data);
> +extern void kvmppc_watchdog_func(unsigned long data);
> extern int kvmppc_sanity_check(struct kvm_vcpu *vcpu);
>
> /* Core-specific hooks */
> @@ -104,6 +105,8 @@ extern void kvmppc_core_queue_external(struct
kvm_vcpu *vcpu,
> struct kvm_interrupt *irq);
> extern void kvmppc_core_dequeue_external(struct kvm_vcpu *vcpu,
> struct kvm_interrupt *irq);
> +extern void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu);
> +extern void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu);
>
> extern int kvmppc_core_emulate_op(struct kvm_run *run, struct
kvm_vcpu *vcpu,
> unsigned int op, int *advance);
> diff --git a/arch/powerpc/include/asm/reg_booke.h
b/arch/powerpc/include/asm/reg_booke.h
> index 2d916c4..e07e6af 100644
> --- a/arch/powerpc/include/asm/reg_booke.h
> +++ b/arch/powerpc/include/asm/reg_booke.h
> @@ -539,6 +539,13 @@
> #define TCR_FIE 0x00800000 /* FIT Interrupt Enable */
> #define TCR_ARE 0x00400000 /* Auto Reload Enable */
>
> +#ifdef CONFIG_E500
> +#define TCR_GET_WP(tcr) ((((tcr) & 0xC0000000) >> 30) | \
> + (((tcr) & 0x1E0000) >> 15))
> +#else
> +#define TCR_GET_WP(tcr) (((tcr) & 0xC0000000) >> 30)
> +#endif
Ben and/or Kumar, mind to ack this bit?
> +
> /* Bit definitions for the TSR. */
> #define TSR_ENW 0x80000000 /* Enable Next Watchdog */
> #define TSR_WIS 0x40000000 /* WDT Interrupt Status */
> diff --git a/arch/powerpc/kvm/44x.c b/arch/powerpc/kvm/44x.c
> index 50e7dbc..a213aba 100644
> --- a/arch/powerpc/kvm/44x.c
> +++ b/arch/powerpc/kvm/44x.c
> @@ -28,6 +28,7 @@
> #include <asm/kvm_44x.h>
> #include <asm/kvm_ppc.h>
>
> +#include "booke.h"
Look 2 lines below :)
> #include "44x_tlb.h"
> #include "booke.h"
>
> diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
> index d25a097..97f54ea 100644
> --- a/arch/powerpc/kvm/booke.c
> +++ b/arch/powerpc/kvm/booke.c
> @@ -206,6 +206,16 @@ void kvmppc_core_dequeue_external(struct
kvm_vcpu *vcpu,
> clear_bit(BOOKE_IRQPRIO_EXTERNAL_LEVEL,
&vcpu->arch.pending_exceptions);
> }
>
> +void kvmppc_core_queue_watchdog(struct kvm_vcpu *vcpu)
> +{
> + kvmppc_booke_queue_irqprio(vcpu, BOOKE_IRQPRIO_WATCHDOG);
> +}
> +
> +void kvmppc_core_dequeue_watchdog(struct kvm_vcpu *vcpu)
> +{
> + clear_bit(BOOKE_IRQPRIO_WATCHDOG, &vcpu->arch.pending_exceptions);
> +}
> +
> static void set_guest_srr(struct kvm_vcpu *vcpu, unsigned long srr0,
u32 srr1)
> {
> #ifdef CONFIG_KVM_BOOKE_HV
> @@ -325,6 +335,7 @@ static int kvmppc_booke_irqprio_deliver(struct
kvm_vcpu *vcpu,
> msr_mask = MSR_CE | MSR_ME | MSR_DE;
> int_class = INT_CLASS_NONCRIT;
> break;
> + case BOOKE_IRQPRIO_WATCHDOG:
> case BOOKE_IRQPRIO_CRITICAL:
> case BOOKE_IRQPRIO_DBELL_CRIT:
> allowed = vcpu->arch.shared->msr & MSR_CE;
> @@ -404,12 +415,101 @@ static int kvmppc_booke_irqprio_deliver(struct
kvm_vcpu *vcpu,
> return allowed;
> }
>
> +/*
> + * Return the number of jiffies until the next timeout. If the
timeout is
> + * longer than the NEXT_TIMER_MAX_DELTA, that
then?
> return NEXT_TIMER_MAX_DELTA
> + * instead.
I can read code. The important piece of information in the comment is
missing: The reason.
> + */
> +static unsigned long watchdog_next_timeout(struct kvm_vcpu *vcpu)
> +{
> + u64 tb, wdt_tb, wdt_ticks = 0;
> + u64 nr_jiffies = 0;
> + u32 period = TCR_GET_WP(vcpu->arch.tcr);
> +
> + wdt_tb = 1ULL << (63 - period);
> + tb = get_tb();
> + /*
> + * The watchdog timeout will hapeen when TB bit corresponding
> + * to watchdog will toggle from 0 to 1.
> + */
> + if (tb & wdt_tb)
> + wdt_ticks = wdt_tb;
> +
> + wdt_ticks += wdt_tb - (tb & (wdt_tb - 1));
> +
> + /* Convert timebase ticks to jiffies */
> + nr_jiffies = wdt_ticks;
> +
> + if (do_div(nr_jiffies, tb_ticks_per_jiffy))
> + nr_jiffies++;
> +
> + return min_t(unsigned long long, nr_jiffies, NEXT_TIMER_MAX_DELTA);
> +}
> +
> +static void arm_next_watchdog(struct kvm_vcpu *vcpu)
> +{
> + unsigned long nr_jiffies;
> +
> + nr_jiffies = watchdog_next_timeout(vcpu);
> + spin_lock(&vcpu->arch.wdt_lock);
> + if (nr_jiffies < NEXT_TIMER_MAX_DELTA)
> + mod_timer(&vcpu->arch.wdt_timer, jiffies + nr_jiffies);
> + else
> + del_timer(&vcpu->arch.wdt_timer);
> + spin_unlock(&vcpu->arch.wdt_lock);
> +}
> +
> +void kvmppc_watchdog_func(unsigned long data)
> +{
> + struct kvm_vcpu *vcpu = (struct kvm_vcpu *)data;
> + u32 tsr, new_tsr;
> + int final;
> +
> + do {
> + new_tsr = tsr = vcpu->arch.tsr;
> + final = 0;
> +
> + /* Time out event */
> + if (tsr & TSR_ENW) {
> + if (tsr & TSR_WIS) {
> + new_tsr = (tsr & ~TCR_WRC_MASK) |
> + (vcpu->arch.tcr & TCR_WRC_MASK);
> + vcpu->arch.tcr &= ~TCR_WRC_MASK;
Can't we just poke the vcpu to exit the VM and do the above on its own?
This is the watdog expired case, right? I'd also prefer to have an
explicit event for the expiry than a special TSR check in the main loop.
Also call me sceptic on the reset of tcr. If our user space watchdog
event is "write a message", then we essentially want to hide the fact
that the watchdog expired from the guest, right? In that case, the
second time-out wouldn't do anything guest visible.
> + final = 1;
> + } else {
> + new_tsr = tsr | TSR_WIS;
> + }
> + } else {
> + new_tsr = tsr | TSR_ENW;
> + }
> + } while (cmpxchg(&vcpu->arch.tsr, tsr, new_tsr) != tsr);
> +
> + if (new_tsr & (TSR_WIS | TCR_WRC_MASK)) {
> + smp_wmb();
> + kvm_make_request(KVM_REQ_PENDING_TIMER, vcpu);
> + kvm_vcpu_kick(vcpu);
> + }
> +
> + /*
> + * Stop running the watchdog timer after final expiration to
> + * prevent the host from being flooded with timers if the
> + * guest sets a short period.
> + * Timers will resume when TSR/TCR is updated next time.
> + */
> + if (!final)
> + arm_next_watchdog(vcpu);
> +}
> static void update_timer_ints(struct kvm_vcpu *vcpu)
> {
> if ((vcpu->arch.tcr & TCR_DIE) && (vcpu->arch.tsr & TSR_DIS))
> kvmppc_core_queue_dec(vcpu);
> else
> kvmppc_core_dequeue_dec(vcpu);
> +
> + if ((vcpu->arch.tcr & TCR_WIE) && (vcpu->arch.tsr & TSR_WIS))
> + kvmppc_core_queue_watchdog(vcpu);
> + else
> + kvmppc_core_dequeue_watchdog(vcpu);
> }
>
> static void kvmppc_core_check_exceptions(struct kvm_vcpu *vcpu)
> @@ -516,6 +616,12 @@ int kvmppc_vcpu_run(struct kvm_run *kvm_run,
struct kvm_vcpu *vcpu)
> goto out;
> }
>
> + if (vcpu->arch.tsr & TCR_WRC_MASK) {
> + kvm_run->exit_reason = KVM_EXIT_WDT;
> + ret = 0;
> + goto out;
> + }
> +
> kvm_guest_enter();
>
> #ifdef CONFIG_PPC_FPU
> @@ -977,6 +1083,10 @@ int kvmppc_handle_exit(struct kvm_run *run,
struct kvm_vcpu *vcpu,
> kvmppc_account_exit(vcpu, SIGNAL_EXITS);
> }
> }
> + if (vcpu->arch.tsr & TCR_WRC_MASK) {
> + run->exit_reason = KVM_EXIT_WDT;
> + r = RESUME_HOST | (r & RESUME_FLAG_NV);
> + }
>
> return r;
> }
> @@ -1106,7 +1216,14 @@ static int set_sregs_base(struct kvm_vcpu *vcpu,
> }
>
> if (sregs->u.e.update_special & KVM_SREGS_E_UPDATE_TSR) {
> + u32 old_tsr = vcpu->arch.tsr;
> +
> vcpu->arch.tsr = sregs->u.e.tsr;
> +
> + if (vcpu->arch.watchdog_enable && ((old_tsr ^ vcpu->arch.tsr) &
> + (TSR_ENW | TSR_WIS | TCR_WRC_MASK)))
> + arm_next_watchdog(vcpu);
Can you please move all required checks into arm_next_watchdog and call
it unconditionally? Especially the watchdog_enable. Unless you want to
follow Scott's scheme and merely make the user space delivery depend on
watchdog_enable, which also works.
> +
> update_timer_ints(vcpu);
> }
>
> @@ -1267,6 +1384,8 @@ void kvmppc_core_commit_memory_region(struct
kvm *kvm,
> void kvmppc_set_tcr(struct kvm_vcpu *vcpu, u32 new_tcr)
> {
> vcpu->arch.tcr = new_tcr;
> + if (vcpu->arch.watchdog_enable)
> + arm_next_watchdog(vcpu);
> update_timer_ints(vcpu);
> }
>
> @@ -1281,6 +1400,15 @@ void kvmppc_set_tsr_bits(struct kvm_vcpu
*vcpu, u32 tsr_bits)
> void kvmppc_clr_tsr_bits(struct kvm_vcpu *vcpu, u32 tsr_bits)
> {
> clear_bits(tsr_bits, &vcpu->arch.tsr);
> +
> + /*
> + * We may have stopped the watchdog due to
> + * being stuck on final expiration.
> + */
> + if (vcpu->arch.watchdog_enable && (tsr_bits & (TSR_ENW |
> + TSR_WIS | TCR_WRC_MASK)))
> + arm_next_watchdog(vcpu);
> +
> update_timer_ints(vcpu);
> }
>
> diff --git a/arch/powerpc/kvm/booke_emulate.c
b/arch/powerpc/kvm/booke_emulate.c
> index 12834bb..5a66ade 100644
> --- a/arch/powerpc/kvm/booke_emulate.c
> +++ b/arch/powerpc/kvm/booke_emulate.c
> @@ -145,6 +145,14 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu
*vcpu, int sprn, ulong spr_val)
> kvmppc_clr_tsr_bits(vcpu, spr_val);
> break;
> case SPRN_TCR:
> + /*
> + * WRC is a 2-bit field that is supposed to preserve its
> + * value once written to non-zero.
> + */
> + if (vcpu->arch.tcr & TCR_WRC_MASK) {
> + spr_val &= ~TCR_WRC_MASK;
> + spr_val |= vcpu->arch.tcr & TCR_WRC_MASK;
> + }
> kvmppc_set_tcr(vcpu, spr_val);
> break;
>
> diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
> index 87f4dc8..646c4da 100644
> --- a/arch/powerpc/kvm/powerpc.c
> +++ b/arch/powerpc/kvm/powerpc.c
> @@ -38,9 +38,13 @@
>
> int kvm_arch_vcpu_runnable(struct kvm_vcpu *v)
> {
> - return !(v->arch.shared->msr & MSR_WE) ||
> - !!(v->arch.pending_exceptions) ||
> - v->requests;
> + bool ret = !(v->arch.shared->msr & MSR_WE) ||
> + !!(v->arch.pending_exceptions) ||
> + v->requests;
> +
> + ret = ret || kvmppc_get_tsr_wrc(v);
Why do you need to declare the cpu as non-runnable when a watchdog event
occured?
> +
> + return ret;
> }
>
> int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
> @@ -237,6 +241,7 @@ int kvm_dev_ioctl_check_extension(long ext)
> case KVM_CAP_PPC_GET_PVINFO:
> #if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC)
> case KVM_CAP_SW_TLB:
> + case KVM_CAP_PPC_WDT:
> #endif
> r = 1;
> break;
> @@ -386,13 +391,21 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
> #ifdef CONFIG_KVM_EXIT_TIMING
> mutex_init(&vcpu->arch.exit_timing_lock);
> #endif
> -
> +#ifdef CONFIG_BOOKE
> + spin_lock_init(&vcpu->arch.wdt_lock);
> +#endif
> return 0;
> }
>
> void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
> {
> kvmppc_mmu_destroy(vcpu);
> +#ifdef CONFIG_BOOKE
> + spin_lock(&vcpu->arch.wdt_lock);
> + if (vcpu->arch.watchdog_enable)
> + del_timer(&vcpu->arch.wdt_timer);
> + spin_unlock(&vcpu->arch.wdt_lock);
> +#endif
> }
>
> void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
> @@ -637,6 +650,16 @@ static int kvm_vcpu_ioctl_enable_cap(struct
kvm_vcpu *vcpu,
> r = 0;
> vcpu->arch.papr_enabled = true;
> break;
> +#ifdef CONFIG_BOOKE
> + case KVM_CAP_PPC_WDT:
> + r = 0;
> + /* setup watchdog timer once */
> + if (!vcpu->arch.watchdog_enable)
> + setup_timer(&vcpu->arch.wdt_timer,
> + kvmppc_watchdog_func, (unsigned long)vcpu);
> + vcpu->arch.watchdog_enable = true;
> + break;
> +#endif
> #if defined(CONFIG_KVM_E500V2) || defined(CONFIG_KVM_E500MC)
> case KVM_CAP_SW_TLB: {
> struct kvm_config_tlb cfg;
> diff --git a/include/linux/kvm.h b/include/linux/kvm.h
> index 2ce09aa..b9fdb52 100644
> --- a/include/linux/kvm.h
> +++ b/include/linux/kvm.h
> @@ -163,6 +163,7 @@ struct kvm_pit_config {
> #define KVM_EXIT_OSI 18
> #define KVM_EXIT_PAPR_HCALL 19
> #define KVM_EXIT_S390_UCONTROL 20
> +#define KVM_EXIT_WDT 21
Please make this a more generic KVM_EXIT_WATCHDOG so that other archs
may have the chance to reuse it.
Alex
>
> /* For KVM_EXIT_INTERNAL_ERROR */
> #define KVM_INTERNAL_ERROR_EMULATION 1
> @@ -618,6 +619,7 @@ struct kvm_ppc_smmu_info {
> #define KVM_CAP_PPC_GET_SMMU_INFO 78
> #define KVM_CAP_S390_COW 79
> #define KVM_CAP_PPC_ALLOC_HTAB 80
> +#define KVM_CAP_PPC_WDT 81
>
> #ifdef KVM_CAP_IRQ_ROUTING
>
--
To unsubscribe from this list: send the line "unsubscribe kvm" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html