patch for virtual machine oriented scheduling (3)

The code for the credit scheduler follows.
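The gist of the algorithm: while a VCPU runs it is charged CSCHED_CREDITS_PER_TICK (100) credits every 10 ms tick, and every third tick the master CPU redistributes credits among the active VMs in proportion to their weights. Ignoring the cap and peak clamping, the per-VCPU share computed in csched_acct() boils down to the following (a simplified sketch, not part of the patch; the helper name fair_share is made up for illustration):

    /* weighted share of an accounting period's credits, per active VCPU */
    static inline uint32_t fair_share(uint32_t credit_total, uint16_t weight,
                                      uint32_t weight_total, uint16_t nr_vcpus)
    {
            /* the VM's share, rounded up, in proportion to its weight */
            uint32_t fair = (credit_total * weight + weight_total - 1)
                            / weight_total;
            /* split evenly among the VM's active VCPUs */
            return (fair + nr_vcpus - 1) / nr_vcpus;
    }

VCPUs that still have credits run at priority UNDER, VCPUs that have overdrawn drop to OVER, and a VCPU that has just woken up is temporarily BOOSTed so that I/O-bound guests see low wake-to-run latency. A non-zero "cap" additionally limits a VM to that percentage of a physical CPU even when the host is otherwise idle.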
-----------------------------------------------------------------------------------------------------------------------
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 4d76bb6..9e88ff0 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -284,7 +284,7 @@ int kvm_apic_match_dest(struct kvm_vcpu *vcpu, struct kvm_lapic *source,
                  "dest_mode 0x%x, short_hand 0x%x\n",
                  target, source, dest, dest_mode, short_hand);
                  "dest_mode 0x%x, short_hand 0x%x\n",
                  target, source, dest, dest_mode, short_hand);

-       ASSERT(!target);
+       ASSERT(target);
       switch (short_hand) {
       case APIC_DEST_NOSHORT:
               if (dest_mode == 0)
diff --git a/arch/x86/kvm/sched_credit.c b/arch/x86/kvm/sched_credit.c
new file mode 100644
index 0000000..77c726a
--- /dev/null
+++ b/arch/x86/kvm/sched_credit.c
@@ -0,0 +1,1604 @@
+/****************************************************************************
+ * (C) 2005-2006 - Emmanuel Ackaouy - XenSource Inc.
+ * (C) 2009     - Xiaojian Liu
+ ****************************************************************************
+ *
+ *        File: common/csched_credit.c
+ *      Author: Emmanuel Ackaouy
+ *     Updated by Xiaojian Liu to make it runnable under KVM
+ *
+ * Description: Credit-based SMP CPU scheduler
+ */
+#include <linux/cpumask.h>
+#include <linux/kvm_host.h>
+#include <linux/trace.h>
+#include <linux/sched-if.h>
+#include <linux/ipi.h>
+#include <linux/hardirq.h>
+
+#ifdef DEBUG
+#define ASSERT(x)                                                      \
+do {                                                                   \
+       if (!(x)) {                                                     \
+               printk(KERN_EMERG "assertion failed %s: %d: %s\n",      \
+                      __FILE__, __LINE__, #x);                         \
+               BUG();                                                  \
+       }                                                               \
+} while (0)
+#else
+#define ASSERT(x) do { } while (0)
+#endif
+/*
+ * CSCHED_STATS
+ *
+ * Manage very basic counters and stats.
+ *
+ * Useful for debugging live systems. The stats are displayed
+ * with the runq dumps produced by csched_dump() and csched_dump_pcpu().
+ */
+#define CSCHED_STATS
+
+#define MILLISECS(_ms)  ((s_time_t)((_ms) * 1000000ULL))
+
+/*
+ * Basic constants
+ */
+#define CSCHED_DEFAULT_WEIGHT       256
+#define CSCHED_TICKS_PER_TSLICE     3
+#define CSCHED_TICKS_PER_ACCT       3
+#define CSCHED_MSECS_PER_TICK       10
+#define CSCHED_MSECS_PER_TSLICE     \
+    (CSCHED_MSECS_PER_TICK * CSCHED_TICKS_PER_TSLICE)
+#define CSCHED_CREDITS_PER_TICK     100
+#define CSCHED_CREDITS_PER_TSLICE   \
+    (CSCHED_CREDITS_PER_TICK * CSCHED_TICKS_PER_TSLICE)
+#define CSCHED_CREDITS_PER_ACCT     \
+    (CSCHED_CREDITS_PER_TICK * CSCHED_TICKS_PER_ACCT)
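+
+/*
+ * With the defaults above, a time slice is 3 ticks of 10 ms = 30 ms and
+ * both a time slice and an accounting period are worth 300 credits.
+ */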
+
+
+/*
+ * Priorities
+ */
+#define CSCHED_PRI_TS_BOOST      0      /* time-share waking up */
+#define CSCHED_PRI_TS_UNDER     -1      /* time-share w/ credits */
+#define CSCHED_PRI_TS_OVER      -2      /* time-share w/o credits */
+#define CSCHED_PRI_IDLE         -64     /* idle */
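+
+/*
+ * Runnable VCPUs are kept on each PCPU's runq in this priority order:
+ * BOOST first, then UNDER, then OVER; the idle VCPU runs only when
+ * nothing else is runnable.
+ */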
+
+
+/*
+ * Flags
+ */
+#define CSCHED_FLAG_VCPU_PARKED 0x0001  /* VCPU over capped credits */
+
+
+/*
+ * Useful macros
+ */
+#define CSCHED_PCPU(_c)     \
+    ((struct csched_pcpu *)per_cpu(schedule_data, _c).sched_priv)
+#define CSCHED_VCPU(_vcpu)  ((struct csched_vcpu *) (_vcpu)->sched_priv)
+#define CSCHED_VM(_dom)    ((struct csched_vm *) (_dom)->sched_priv)
+#define RUNQ(_cpu)          (&(CSCHED_PCPU(_cpu)->runq))
+
+
+/*
+ * Stats
+ */
+#ifdef CSCHED_STATS
+
+#define CSCHED_STAT(_X)         (csched_priv.stats._X)
+#define CSCHED_STAT_DEFINE(_X)  uint32_t _X;
+#define CSCHED_STAT_PRINTK(_X)                                  \
+    do                                                          \
+    {                                                           \
+        printk("\t%-30s = %u\n", #_X, CSCHED_STAT(_X));  \
+    } while ( 0 );
+
+/*
+ * Try and keep often cranked stats on top so they'll fit on one
+ * cache line.
+ */
+#define CSCHED_STATS_EXPAND_SCHED(_MACRO)   \
+    _MACRO(schedule)                        \
+    _MACRO(acct_run)                        \
+    _MACRO(acct_no_work)                    \
+    _MACRO(acct_balance)                    \
+    _MACRO(acct_reorder)                    \
+    _MACRO(acct_min_credit)                 \
+    _MACRO(acct_vcpu_active)                \
+    _MACRO(acct_vcpu_idle)                  \
+    _MACRO(vcpu_sleep)                      \
+    _MACRO(vcpu_wake_running)               \
+    _MACRO(vcpu_wake_onrunq)                \
+    _MACRO(vcpu_wake_runnable)              \
+    _MACRO(vcpu_wake_not_runnable)          \
+    _MACRO(vcpu_park)                       \
+    _MACRO(vcpu_unpark)                     \
+    _MACRO(tickle_local_idler)              \
+    _MACRO(tickle_local_over)               \
+    _MACRO(tickle_local_under)              \
+    _MACRO(tickle_local_other)              \
+    _MACRO(tickle_idlers_none)              \
+    _MACRO(tickle_idlers_some)              \
+    _MACRO(load_balance_idle)               \
+    _MACRO(load_balance_over)               \
+    _MACRO(load_balance_other)              \
+    _MACRO(steal_trylock_failed)            \
+    _MACRO(steal_peer_idle)                 \
+    _MACRO(migrate_queued)                  \
+    _MACRO(migrate_running)                 \
+    _MACRO(dom_init)                        \
+    _MACRO(dom_destroy)                     \
+    _MACRO(vcpu_init)                       \
+    _MACRO(vcpu_destroy)
+
+#ifndef NDEBUG
+#define CSCHED_STATS_EXPAND_CHECKS(_MACRO)  \
+    _MACRO(vcpu_check)
+#else
+#define CSCHED_STATS_EXPAND_CHECKS(_MACRO)
+#endif
+
+#define CSCHED_STATS_EXPAND(_MACRO)         \
+    CSCHED_STATS_EXPAND_CHECKS(_MACRO)      \
+    CSCHED_STATS_EXPAND_SCHED(_MACRO)
+
+#define CSCHED_STATS_RESET()                                        \
+    do                                                              \
+    {                                                               \
+        memset(&csched_priv.stats, 0, sizeof(csched_priv.stats));   \
+    } while ( 0 )
+
+#define CSCHED_STATS_DEFINE()                   \
+    struct                                      \
+    {                                           \
+        CSCHED_STATS_EXPAND(CSCHED_STAT_DEFINE) \
+    } stats;
+
+#define CSCHED_STATS_PRINTK()                   \
+    do                                          \
+    {                                           \
+        printk("stats:\n");                     \
+        CSCHED_STATS_EXPAND(CSCHED_STAT_PRINTK) \
+    } while ( 0 )
+
+#define CSCHED_STAT_CRANK(_X)               (CSCHED_STAT(_X)++)
+
+#define CSCHED_VCPU_STATS_RESET(_V)                     \
+    do                                                  \
+    {                                                   \
+        memset(&(_V)->stats, 0, sizeof((_V)->stats));   \
+    } while ( 0 )
+
+#define CSCHED_VCPU_STAT_CRANK(_V, _X)      (((_V)->stats._X)++)
+
+#define CSCHED_VCPU_STAT_SET(_V, _X, _Y)    (((_V)->stats._X) = (_Y))
+
+#else /* CSCHED_STATS */
+
+#define CSCHED_STATS_RESET()                do {} while ( 0 )
+#define CSCHED_STATS_DEFINE()
+#define CSCHED_STATS_PRINTK()               do {} while ( 0 )
+#define CSCHED_STAT_CRANK(_X)               do {} while ( 0 )
+#define CSCHED_VCPU_STATS_RESET(_V)         do {} while ( 0 )
+#define CSCHED_VCPU_STAT_CRANK(_V, _X)      do {} while ( 0 )
+#define CSCHED_VCPU_STAT_SET(_V, _X, _Y)    do {} while ( 0 )
+
+#endif /* CSCHED_STATS */
+
+
+/*
+ * Physical CPU
+ */
+struct csched_pcpu {
+    struct list_head runq;
+    uint32_t runq_sort_last;
+    struct hrtimer ticker;
+    unsigned int tick;
+    int cpu;
+    volatile bool in_use;
+};
+
+/*
+ * Virtual CPU
+ */
+struct csched_vcpu {
+    struct list_head runq_elem;
+    struct list_head active_vcpu_elem;
+    struct csched_vm *sdom;
+    struct kvm_vcpu *vcpu;
+    atomic_t credit;
+    uint16_t flags;
+    int16_t pri;
+#ifdef CSCHED_STATS
+    struct {
+        int credit_last;
+        uint32_t credit_incr;
+        uint32_t state_active;
+        uint32_t state_idle;
+        uint32_t migrate_q;
+        uint32_t migrate_r;
+    } stats;
+#endif
+};
+
+/*
+ * Domain
+ */
+struct csched_vm{
+    struct list_head active_vcpu;
+    struct list_head active_sdom_elem;
+    struct kvm *vm;
+    uint16_t active_vcpu_count;
+    uint16_t weight;
+    uint16_t cap;
+};
+
+/*
+ * System-wide private data
+ */
+struct csched_private {
+    spinlock_t lock;
+    struct list_head active_sdom;
+    uint32_t ncpus;
+    unsigned int master;
+    cpumask_t idlers;
+    uint32_t weight;
+    uint32_t credit;
+    int credit_balance;
+    uint32_t runq_sort;
+    CSCHED_STATS_DEFINE()
+};
+
+/*
+ * Global variables
+ */
+static struct csched_private csched_priv;
+
+static enum hrtimer_restart  csched_tick_stub(struct hrtimer *data);
+DECLARE_PER_CPU(cpumask_t, cpu_core_map);
+DECLARE_PER_CPU(cpumask_t, cpu_sibling_map);
+
+extern void vcpu_pause(struct kvm_vcpu *v);
+extern void vcpu_unpause(struct kvm_vcpu *v);
+extern void vcpu_pause_nosync(struct kvm_vcpu *v);
+
+static void csched_disable(int cpu)
+{
+    struct csched_pcpu *spc = CSCHED_PCPU(cpu);
+    tasklet_kill(&per_cpu(schedule_data, cpu).tick_tasklet);
+}
+void kvm_try_tasklet_schedule(void *_unused)
+{
+    int cpu = get_cpu();
+    TRACE_2D(TRC_TASKLET_SCHEDULE, cpu, __LINE__);
+    if (!spin_is_locked(&per_cpu(schedule_data, cpu).schedule_lock)){
+       tasklet_schedule(&per_cpu(schedule_data, cpu).sched_tasklet);
+       TRACE_2D(TRC_TASKLET_SCHEDULE, cpu, __LINE__);
+    }
+    put_cpu();
+}
+static inline int
+__cycle_cpu(int cpu, const cpumask_t *mask)
+{
+    int nxt = next_cpu(cpu, *mask);
+    if (nxt == NR_CPUS)
+        nxt = first_cpu(*mask);
+    return nxt;
+}
+
+static inline int
+__vcpu_on_runq(struct csched_vcpu *svc)
+{
+    return !list_empty(&svc->runq_elem);
+}
+
+static inline struct csched_vcpu *
+__runq_elem(struct list_head *elem)
+{
+    return list_entry(elem, struct csched_vcpu, runq_elem);
+}
+
+static inline void
+__runq_insert(unsigned int cpu, struct csched_vcpu *svc)
+{
+    const struct list_head * const runq = RUNQ(cpu);
+    struct list_head *iter;
+
+    BUG_ON( __vcpu_on_runq(svc) );
+    BUG_ON( cpu != svc->vcpu->processor);
+
+    list_for_each( iter, runq )
+    {
+        const struct csched_vcpu * const iter_svc = __runq_elem(iter);
+        if ( svc->pri > iter_svc->pri )
+            break;
+    }
+
+    list_add_tail(&svc->runq_elem, iter);
+}
+
+static inline void
+__runq_remove(struct csched_vcpu *svc)
+{
+    BUG_ON( !__vcpu_on_runq(svc) );
+    list_del_init(&svc->runq_elem);
+}
+
+static inline void
+__runq_tickle(unsigned int cpu, struct csched_vcpu *new)
+{
+    struct csched_vcpu * const cur =
+        CSCHED_VCPU(per_cpu(schedule_data, cpu).curr);
+    cpumask_var_t cpus;
+
+    int my_cpu = get_cpu();
+
+    ASSERT(cur);
+    if (!alloc_cpumask_var(&cpus, GFP_ATOMIC)) {
+       /* no mask available: skip the tickle rather than dereference it */
+       put_cpu();
+       return;
+    }
+    cpumask_clear(cpus);
+
+    /* If strictly higher priority than current VCPU, signal the CPU */
+    if ( new->pri > cur->pri )
+    {
+        if ( cur->pri == CSCHED_PRI_IDLE )
+            CSCHED_STAT_CRANK(tickle_local_idler);
+        else if ( cur->pri == CSCHED_PRI_TS_OVER )
+            CSCHED_STAT_CRANK(tickle_local_over);
+        else if ( cur->pri == CSCHED_PRI_TS_UNDER )
+            CSCHED_STAT_CRANK(tickle_local_under);
+        else
+            CSCHED_STAT_CRANK(tickle_local_other);
+
+       cpumask_set_cpu(cpu, cpus);
+    }
+
+    /*
+     * If this CPU has at least two runnable VCPUs, we tickle any idlers to
+     * let them know there is runnable work in the system...
+     */
+    if ( cur->pri > CSCHED_PRI_IDLE )
+    {
+        if ( cpumask_empty(&csched_priv.idlers) )
+        {
+            CSCHED_STAT_CRANK(tickle_idlers_none);
+        }
+        else
+        {
+            CSCHED_STAT_CRANK(tickle_idlers_some);
+            cpumask_or(cpus, cpus, &csched_priv.idlers);
+            cpumask_and(cpus, cpus, &new->vcpu->cpu_affinity);
+        }
+    }
+
+    /* Send scheduler interrupts to designated CPUs */
+    if (cpumask_test_cpu(my_cpu, cpus)){
+       kvm_try_tasklet_schedule(NULL);
+       cpumask_clear_cpu(my_cpu, cpus);
+    }
+
+    if ( !cpumask_empty(cpus) ){
+       int r;
+       r = insert_pending_ipi(my_cpu, *cpus,
+               kvm_try_tasklet_schedule, NULL, 1);
+       if (r) {
+           dump_traces(NULL);
+           BUG_ON(1);
+       }
+       wake_up(&per_cpu(schedule_data, my_cpu).ipi_wq);
+    }
+    free_cpumask_var(cpus);
+    put_cpu();
+}
+
+static int
+csched_pcpu_init(int cpu)
+{
+       struct csched_pcpu *spc;
+       unsigned long flags;
+       printk("cpu %d exec func %s\n", cpu, __FUNCTION__);
+
+       /* Allocate per-PCPU info */
+       spc = kmalloc(sizeof(struct csched_pcpu), GFP_KERNEL);
+       if ( spc == NULL )
+           return -1;
+
+       spin_lock_irqsave(&csched_priv.lock, flags);
+
+       /* Initialize/update system-wide config */
+       csched_priv.credit += CSCHED_CREDITS_PER_ACCT;
+       if ( csched_priv.ncpus <= cpu )
+           csched_priv.ncpus = cpu + 1;
+       if ( csched_priv.master >= csched_priv.ncpus )
+           csched_priv.master = cpu;
+
+       spc->cpu = cpu;
+       hrtimer_init(&spc->ticker, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+       spc->ticker.function = csched_tick_stub;
+       // hrtimer_data_pointer(&spc->ticker);
+
+       INIT_LIST_HEAD(&spc->runq);
+       spc->runq_sort_last = csched_priv.runq_sort;
+       per_cpu(schedule_data, cpu).sched_priv = spc;
+
+       spc->in_use = false;
+
+       /* Start off idling... */
+       BUG_ON(!is_idle_vcpu(per_cpu(schedule_data, cpu).curr));
+       cpumask_set_cpu(cpu, &csched_priv.idlers);
+
+       spin_unlock_irqrestore(&csched_priv.lock, flags);
+       return 0;
+}
+
+#ifndef NDEBUG
+static inline void
+__csched_vcpu_check(struct kvm_vcpu *vc)
+{
+    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
+    struct csched_vm * const sdom = svc->sdom;
+
+    BUG_ON( svc->vcpu != vc );
+    BUG_ON( sdom != CSCHED_VM(vc->kvm) );
+    if ( sdom )
+    {
+        BUG_ON( is_idle_vcpu(vc) );
+        BUG_ON( sdom->vm != vc->kvm);
+    }
+    else
+    {
+        BUG_ON( !is_idle_vcpu(vc) );
+    }
+
+    CSCHED_STAT_CRANK(vcpu_check);
+}
+#define CSCHED_VCPU_CHECK(_vc)  (__csched_vcpu_check(_vc))
+#else
+#define CSCHED_VCPU_CHECK(_vc)
+#endif
+
+static inline int
+__csched_vcpu_is_migrateable(struct kvm_vcpu *vc, int dest_cpu)
+{
+    /*
+     * Don't pick up work that's in the peer's scheduling tail. Also only pick
+     * up work that's allowed to run on our CPU.
+     */
+
+    if(!per_cpu(schedule_data, raw_smp_processor_id()).can_migrate) return 0;
+    return !vc->is_running && cpu_isset(dest_cpu, vc->cpu_affinity);
+}
+
+static int
+csched_cpu_pick(struct kvm_vcpu *vc)
+{
+    cpumask_t cpus;
+    cpumask_t idlers;
+    int cpu;
+
+    /*
+     * Pick from online CPUs in VCPU's affinity mask, giving a
+     * preference to its current processor if it's in there.
+     */
+    cpus_and(cpus, cpu_online_map, vc->cpu_affinity);
+    cpu = cpu_isset(vc->processor, cpus)
+            ? vc->processor
+            : __cycle_cpu(vc->processor, &cpus);
+    ASSERT( !cpus_empty(cpus) && cpu_isset(cpu, cpus) );
+
+    /*
+     * Try to find an idle processor within the above constraints.
+     *
+     * In multi-core and multi-threaded CPUs, not all idle execution
+     * vehicles are equal!
+     *
+     * We give preference to the idle execution vehicle with the most
+     * idling neighbours in its grouping. This distributes work across
+     * distinct cores first and guarantees we don't do something stupid
+     * like run two VCPUs on co-hyperthreads while there are idle cores
+     * or sockets.
+     */
+    idlers = csched_priv.idlers;
+    cpu_set(cpu, idlers);
+    cpus_and(cpus, cpus, idlers);
+    cpu_clear(cpu, cpus);
+
+    while ( !cpus_empty(cpus) )
+    {
+        cpumask_t cpu_idlers;
+        cpumask_t nxt_idlers;
+        int nxt;
+
+        nxt = __cycle_cpu(cpu, &cpus);
+
+        if ( cpu_isset(cpu, per_cpu(cpu_core_map,nxt)) )
+        {
+            ASSERT( cpu_isset(nxt, per_cpu(cpu_core_map,cpu)) );
+            cpus_and(cpu_idlers, idlers, per_cpu(cpu_sibling_map,cpu));
+            cpus_and(nxt_idlers, idlers, per_cpu(cpu_sibling_map,nxt));
+        }
+        else
+        {
+            ASSERT( !cpu_isset(nxt, per_cpu(cpu_core_map,cpu)) );
+            cpus_and(cpu_idlers, idlers, per_cpu(cpu_core_map,cpu));
+            cpus_and(nxt_idlers, idlers, per_cpu(cpu_core_map,nxt));
+        }
+
+        if ( cpus_weight(cpu_idlers) < cpus_weight(nxt_idlers) )
+        {
+            cpu = nxt;
+            cpu_clear(cpu, cpus);
+        }
+        else
+        {
+            cpus_andnot(cpus, cpus, nxt_idlers);
+        }
+    }
+
+    return cpu;
+}
+
+static inline void
+__csched_vcpu_acct_start(struct csched_vcpu *svc)
+{
+    struct csched_vm * const sdom = svc->sdom;
+    unsigned long flags;
+
+    spin_lock_irqsave(&csched_priv.lock, flags);
+
+    if ( list_empty(&svc->active_vcpu_elem) )
+    {
+        CSCHED_VCPU_STAT_CRANK(svc, state_active);
+        CSCHED_STAT_CRANK(acct_vcpu_active);
+
+        sdom->active_vcpu_count++;
+        list_add(&svc->active_vcpu_elem, &sdom->active_vcpu);
+        if ( list_empty(&sdom->active_sdom_elem) )
+        {
+            list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom);
+            csched_priv.weight += sdom->weight;
+        }
+    }
+
+    spin_unlock_irqrestore(&csched_priv.lock, flags);
+}
+
+static inline void
+__csched_vcpu_acct_stop_locked(struct csched_vcpu *svc)
+{
+    struct csched_vm * const sdom = svc->sdom;
+
+    BUG_ON( list_empty(&svc->active_vcpu_elem) );
+
+    CSCHED_VCPU_STAT_CRANK(svc, state_idle);
+    CSCHED_STAT_CRANK(acct_vcpu_idle);
+
+    sdom->active_vcpu_count--;
+    list_del_init(&svc->active_vcpu_elem);
+    if ( list_empty(&sdom->active_vcpu) )
+    {
+        BUG_ON( csched_priv.weight < sdom->weight );
+        list_del_init(&sdom->active_sdom_elem);
+        csched_priv.weight -= sdom->weight;
+    }
+}
+
+static void
+csched_vcpu_acct(unsigned int cpu)
+{
+    struct csched_vcpu * const svc = CSCHED_VCPU(current_vcpu);
+
+    if ( current_vcpu->processor != cpu ) {
+       if(is_host_vcpu(current_vcpu)) printk("host_vcpu running ");
+       if(is_idle_vcpu(current_vcpu)) printk("idle_vcpu running ");
+       printk(" on cpu %d but this is cpu %d\n", current_vcpu->processor, cpu);
+       BUG_ON(1);
+    }
+    ASSERT( svc->sdom != NULL );
+
+    /*
+     * If this VCPU's priority was boosted when it last awoke, reset it.
+     * If the VCPU is found here, then it's consuming a non-negligible
+     * amount of CPU resources and should no longer be boosted.
+     */
+    if ( svc->pri == CSCHED_PRI_TS_BOOST )
+        svc->pri = CSCHED_PRI_TS_UNDER;
+
+    /*
+     * Update credits
+     */
+    atomic_sub(CSCHED_CREDITS_PER_TICK, &svc->credit);
+
+    /*
+     * Put this VCPU and domain back on the active list if it was
+     * idling.
+     *
+     * If it's been active a while, check if we'd be better off
+     * migrating it to run elsewhere (see multi-core and multi-thread
+     * support in csched_cpu_pick()).
+     */
+    if ( list_empty(&svc->active_vcpu_elem) )
+    {
+        __csched_vcpu_acct_start(svc);
+    }
+    else if ( csched_cpu_pick(current_vcpu) != cpu )
+    {
+        CSCHED_VCPU_STAT_CRANK(svc, migrate_r);
+        CSCHED_STAT_CRANK(migrate_running);
+        set_bit(_VPF_migrating, &current_vcpu->pause_flags);
+       kvm_try_tasklet_schedule(NULL);
+    }
+}
+
+static int
+csched_vcpu_init(struct kvm_vcpu *vc)
+{
+       struct kvm* const kvm= vc->kvm;
+       struct csched_vm *sdom = CSCHED_VM(kvm);
+       struct csched_vcpu *svc;
+
+
+       CSCHED_STAT_CRANK(vcpu_init);
+
+       /* Allocate per-VCPU info */
+       svc = kmalloc(sizeof(struct csched_vcpu), GFP_KERNEL);
+       if ( svc == NULL )
+           return -1;
+
+       INIT_LIST_HEAD(&svc->runq_elem);
+       INIT_LIST_HEAD(&svc->active_vcpu_elem);
+       svc->sdom = sdom;
+       svc->vcpu = vc;
+       atomic_set(&svc->credit, 0);
+       svc->flags = 0U;
+       svc->pri = is_idle_vm(kvm) ? CSCHED_PRI_IDLE : CSCHED_PRI_TS_UNDER;
+       CSCHED_VCPU_STATS_RESET(svc);
+       vc->sched_priv = svc;
+
+       /* Allocate per-PCPU info */
+       if ( unlikely(!CSCHED_PCPU(vc->processor)) )
+       {
+           if ( csched_pcpu_init(vc->processor) != 0 )
+               return -1;
+       }
+
+       CSCHED_VCPU_CHECK(vc);
+       return 0;
+}
+
+static void
+csched_vcpu_destroy(struct kvm_vcpu *vc)
+{
+    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
+    struct csched_vm * const sdom = svc->sdom;
+    unsigned long flags;
+
+    CSCHED_STAT_CRANK(vcpu_destroy);
+
+    if(is_idle_vcpu(vc) || is_host_vcpu(vc)) {
+       kfree(svc);
+       return;
+    }
+
+    BUG_ON( sdom == NULL );
+    BUG_ON( !list_empty(&svc->runq_elem) );
+
+    spin_lock_irqsave(&csched_priv.lock, flags);
+
+    if ( !list_empty(&svc->active_vcpu_elem) )
+        __csched_vcpu_acct_stop_locked(svc);
+
+    spin_unlock_irqrestore(&csched_priv.lock, flags);
+
+    kfree(svc);
+}
+
+static void
+csched_vcpu_sleep(struct kvm_vcpu *vc)
+{
+    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
+    int my_cpu = get_cpu();
+
+    CSCHED_STAT_CRANK(vcpu_sleep);
+
+    BUG_ON( is_idle_vcpu(vc) );
+
+    if ( per_cpu(schedule_data, vc->processor).curr == vc ){
+       if(vc->processor== my_cpu)
+           kvm_try_tasklet_schedule(NULL);
+       else{
+           int r;
+           r = insert_pending_ipi(my_cpu, cpumask_of_cpu(vc->processor),
+                   kvm_try_tasklet_schedule, NULL, 1);
+           if(r) {
+               dump_traces(NULL);
+               BUG_ON(1);
+           }
+           wake_up(&per_cpu(schedule_data, my_cpu).ipi_wq);
+       }
+    }
+    else if ( __vcpu_on_runq(svc) )
+        __runq_remove(svc);
+    put_cpu();
+}
+
+static void
+csched_vcpu_wake(struct kvm_vcpu *vc)
+{
+    struct csched_vcpu * const svc = CSCHED_VCPU(vc);
+    const unsigned int cpu = vc->processor;
+    BUG_ON( is_idle_vcpu(vc) );
+    TRACE_2D(TRC_CSCHED_WAKE, __LINE__, (unsigned long)vc);
+
+    if ( unlikely(per_cpu(schedule_data, cpu).curr == vc) )
+    {
+        CSCHED_STAT_CRANK(vcpu_wake_running);
+        return;
+    }
+    if ( unlikely(__vcpu_on_runq(svc)) )
+    {
+        CSCHED_STAT_CRANK(vcpu_wake_onrunq);
+        return;
+    }
+
+    if ( likely(vcpu_runnable(vc)) )
+        CSCHED_STAT_CRANK(vcpu_wake_runnable);
+    else
+        CSCHED_STAT_CRANK(vcpu_wake_not_runnable);
+
+    /*
+     * We temporarily boost the priority of waking VCPUs!
+     *
+     * If this VCPU consumes a non-negligible amount of CPU, it
+     * will eventually find itself in the credit accounting code
+     * path where its priority will be reset to normal.
+     *
+     * If on the other hand the VCPU consumes little CPU and is
+     * blocking and awoken a lot (doing I/O for example), its
+     * priority will remain boosted, optimizing its wake-to-run
+     * latencies.
+     *
+     * This allows wake-to-run latency sensitive VCPUs to preempt
+     * more CPU resource intensive VCPUs without impacting overall
+     * system fairness.
+     *
+     * The one exception is for VCPUs of capped domains unpausing
+     * after earning credits they had overspent. We don't boost
+     * those.
+     */
+    if ( svc->pri == CSCHED_PRI_TS_UNDER &&
+         !(svc->flags & CSCHED_FLAG_VCPU_PARKED) )
+    {
+        svc->pri = CSCHED_PRI_TS_BOOST;
+    }
+
+    /* Put the VCPU on the runq and tickle CPUs */
+    TRACE_2D(TRC_CSCHED_WAKE, __LINE__, vc);
+    __runq_insert(cpu, svc);
+    TRACE_2D(TRC_CSCHED_WAKE, __LINE__, vc);
+    __runq_tickle(cpu, svc);
+    TRACE_2D(TRC_CSCHED_WAKE, __LINE__, vc);
+}
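+
+/*
+ * Parse a single "weight=<n>" or "cap=<n>" token from the string written
+ * through the scheduler's /proc interface (kvm_scheduler_write() in
+ * schedule.c); leading whitespace is skipped and *endp is advanced past
+ * the parsed value so the caller can chain calls.
+ */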
+static void get_param(char* param, char **endp, int *weight, int *cap)
+{
+    char *opt_1;
+    char *opt_2;
+    /* skip heading spaces */
+    while((*param == ' ') || (*param == '\t') || (*param == '\n'))
+       param++;
+
+    opt_1 = strchr(param, '=');
+    if (opt_1 != NULL){
+       char *end = opt_1-1;
+       *opt_1++ = '\0';
+       while((*end == ' ') || (*end == '\t') || (*end == '\n'))
+           *end-- = '\0';
+    }
+    if(strcmp(param, "weight") == 0)
+       *weight = simple_strtol(opt_1, endp, 0);
+    else if(strcmp(param, "cap") == 0)
+       *cap = simple_strtol(opt_1, endp, 0);
+}
+static int csched_read_param(struct kvm *kvm, char *buf, int size)
+{
+    struct csched_vm * const sdom = CSCHED_VM(kvm);
+    unsigned long flags;
+    int r;
+
+    r = snprintf(buf, size, "Credit Schedule parameters for VM %d\n"
+           "Weight: %d     Cap: %d\n", kvm->vmid, sdom->weight, sdom->cap);
+    if (r >= size) printk("the buf for sched_read_param is too small!\n");
+    return (r>0);
+}
+static int csched_write_param(struct kvm *kvm, char *param)
+{
+    char *tmp;
+    int weight = 256;
+    int cap = 0;
+    struct csched_vm * const sdom = CSCHED_VM(kvm);
+    unsigned long flags;
+
+    get_param(param, &tmp, &weight, &cap);
+    get_param(tmp, &tmp, &weight, &cap);
+    spin_lock_irqsave(&csched_priv.lock, flags);
+
+    if ( !list_empty(&sdom->active_sdom_elem) )
+    {
+       csched_priv.weight -= sdom->weight;
+       csched_priv.weight += weight;
+    }
+    sdom->weight = weight;
+    sdom->cap = cap;
+    spin_unlock_irqrestore(&csched_priv.lock, flags);
+    printk("the weight is %d cap is %d\n", weight, cap);
+    return 1;
+}
+
+static int
+csched_vm_init(struct kvm *kvm)
+{
+       struct csched_vm *sdom;
+
+       CSCHED_STAT_CRANK(dom_init);
+       if ( is_idle_vm(kvm) )
+               return 0;
+
+       sdom = kmalloc(sizeof(struct csched_vm), GFP_KERNEL);
+       if ( sdom == NULL )
+               return -ENOMEM;
+
+       /* Initialize credit and weight */
+       INIT_LIST_HEAD(&sdom->active_vcpu);
+       sdom->active_vcpu_count = 0;
+       INIT_LIST_HEAD(&sdom->active_sdom_elem);
+       sdom->vm = kvm;
+       sdom->weight = CSCHED_DEFAULT_WEIGHT;
+       sdom->cap = 0U;
+       kvm->sched_priv = sdom;
+
+       return 0;
+}
+
+static void
+csched_vm_destroy(struct kvm *kvm)
+{
+    CSCHED_STAT_CRANK(dom_destroy);
+    kfree(CSCHED_VM(kvm));
+}
+
+/*
+ * This is an O(n) optimized sort of the runq.
+ *
+ * Time-share VCPUs can only be one of two priorities, UNDER or OVER. We walk
+ * through the runq and move up any UNDERs that are preceded by OVERS. We
+ * remember the last UNDER to make the move up operation O(1).
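+ * For example, a runq ordered [UNDER, OVER, UNDER, OVER] is rearranged
+ * to [UNDER, UNDER, OVER, OVER] in a single pass.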
+ */
+static void
+csched_runq_sort(unsigned int cpu)
+{
+    struct csched_pcpu * const spc = CSCHED_PCPU(cpu);
+    struct list_head *runq, *elem, *next, *last_under;
+    struct csched_vcpu *svc_elem;
+    unsigned long flags;
+    int sort_epoch;
+
+    sort_epoch = csched_priv.runq_sort;
+    if ( sort_epoch == spc->runq_sort_last )
+        return;
+
+    spc->runq_sort_last = sort_epoch;
+
+    spin_lock_irqsave(&per_cpu(schedule_data, cpu).schedule_lock, flags);
+
+    runq = &spc->runq;
+    elem = runq->next;
+    last_under = runq;
+
+    while ( elem != runq )
+    {
+        next = elem->next;
+        svc_elem = __runq_elem(elem);
+
+        if ( svc_elem->pri >= CSCHED_PRI_TS_UNDER )
+        {
+            /* does elem need to move up the runq? */
+            if ( elem->prev != last_under )
+            {
+                list_del(elem);
+                list_add(elem, last_under);
+            }
+            last_under = elem;
+        }
+
+        elem = next;
+    }
+    spin_unlock_irqrestore(&per_cpu(schedule_data, cpu).schedule_lock, flags);
+}
+
+static void
+csched_acct(void)
+{
+    unsigned long flags;
+    struct list_head *iter_vcpu, *next_vcpu;
+    struct list_head *iter_sdom, *next_sdom;
+    struct csched_vcpu *svc;
+    struct csched_vm *sdom;
+    uint32_t credit_total;
+    uint32_t weight_total;
+    uint32_t weight_left;
+    uint32_t credit_fair;
+    uint32_t credit_peak;
+    uint32_t credit_cap;
+    int credit_balance;
+    int credit_xtra;
+    int credit;
+
+
+    spin_lock_irqsave(&csched_priv.lock, flags);
+
+    weight_total = csched_priv.weight;
+    credit_total = csched_priv.credit;
+
+    /* Converge balance towards 0 when it drops negative */
+    if ( csched_priv.credit_balance < 0 )
+    {
+        credit_total -= csched_priv.credit_balance;
+        CSCHED_STAT_CRANK(acct_balance);
+    }
+
+    if ( unlikely(weight_total == 0) )
+    {
+        csched_priv.credit_balance = 0;
+        spin_unlock_irqrestore(&csched_priv.lock, flags);
+        CSCHED_STAT_CRANK(acct_no_work);
+        return;
+    }
+
+    CSCHED_STAT_CRANK(acct_run);
+
+    weight_left = weight_total;
+    credit_balance = 0;
+    credit_xtra = 0;
+    credit_cap = 0U;
+
+    list_for_each_safe( iter_sdom, next_sdom, &csched_priv.active_sdom )
+    {
+        sdom = list_entry(iter_sdom, struct csched_vm, active_sdom_elem);
+
+        BUG_ON( is_idle_vm(sdom->vm) );
+        BUG_ON( sdom->active_vcpu_count == 0 );
+        BUG_ON( sdom->weight == 0 );
+        BUG_ON( sdom->weight > weight_left );
+
+        weight_left -= sdom->weight;
+
+        /*
+         * A domain's fair share is computed using its weight in competition
+         * with that of all other active domains.
+         *
+         * At most, a domain can use credits to run all its active VCPUs
+         * for one full accounting period. We allow a domain to earn more
+         * only when the system-wide credit balance is negative.
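+         *
+         * For example, with two active domains of weight 256 and 512 on
+         * a two-CPU host (credit_total = 600), the fair shares computed
+         * below come to 200 and 400 credits, i.e. a 1:2 split of CPU time.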
+         */
+        credit_peak = sdom->active_vcpu_count * CSCHED_CREDITS_PER_ACCT;
+        if ( csched_priv.credit_balance < 0 )
+        {
+            credit_peak += ( ( -csched_priv.credit_balance * sdom->weight) +
+                             (weight_total - 1)
+                           ) / weight_total;
+        }
+
+        if ( sdom->cap != 0U )
+        {
+            credit_cap = ((sdom->cap * CSCHED_CREDITS_PER_ACCT) + 99) / 100;
+            if ( credit_cap < credit_peak )
+                credit_peak = credit_cap;
+
+            credit_cap = ( credit_cap + ( sdom->active_vcpu_count - 1 )
+                         ) / sdom->active_vcpu_count;
+        }
+
+        credit_fair = ( ( credit_total * sdom->weight) + (weight_total - 1)
+                      ) / weight_total;
+
+        if ( credit_fair < credit_peak )
+        {
+            credit_xtra = 1;
+        }
+        else
+        {
+            if ( weight_left != 0U )
+            {
+                /* Give other domains a chance at unused credits */
+                credit_total += ( ( ( credit_fair - credit_peak
+                                    ) * weight_total
+                                  ) + ( weight_left - 1 )
+                                ) / weight_left;
+            }
+
+            if ( credit_xtra )
+            {
+                /*
+                 * Lazily keep domains with extra credits at the head of
+                 * the queue to give others a chance at them in future
+                 * accounting periods.
+                 */
+                CSCHED_STAT_CRANK(acct_reorder);
+                list_del(&sdom->active_sdom_elem);
+                list_add(&sdom->active_sdom_elem, &csched_priv.active_sdom);
+            }
+
+            credit_fair = credit_peak;
+        }
+
+        /* Compute fair share per VCPU */
+        credit_fair = ( credit_fair + ( sdom->active_vcpu_count - 1 )
+                      ) / sdom->active_vcpu_count;
+
+
+        list_for_each_safe( iter_vcpu, next_vcpu, &sdom->active_vcpu )
+        {
+            svc = list_entry(iter_vcpu, struct csched_vcpu, active_vcpu_elem);
+            BUG_ON( sdom != svc->sdom );
+
+            /* Increment credit */
+            atomic_add(credit_fair, &svc->credit);
+            credit = atomic_read(&svc->credit);
+
+            /*
+             * Recompute priority or, if VCPU is idling, remove it from
+             * the active list.
+             */
+            if ( credit < 0 )
+            {
+                svc->pri = CSCHED_PRI_TS_OVER;
+
+                /* Park running VCPUs of capped-out domains */
+                if ( sdom->cap != 0U &&
+                     credit < -credit_cap &&
+                     !(svc->flags & CSCHED_FLAG_VCPU_PARKED) )
+                {
+                    CSCHED_STAT_CRANK(vcpu_park);
+                    vcpu_pause_nosync(svc->vcpu);
+                    svc->flags |= CSCHED_FLAG_VCPU_PARKED;
+                }
+
+                /* Lower bound on credits */
+                if ( credit < -CSCHED_CREDITS_PER_TSLICE )
+                {
+                    CSCHED_STAT_CRANK(acct_min_credit);
+                    credit = -CSCHED_CREDITS_PER_TSLICE;
+                    atomic_set(&svc->credit, credit);
+                }
+            }
+            else
+            {
+                svc->pri = CSCHED_PRI_TS_UNDER;
+
+                /* Unpark any capped domains whose credits go positive */
+                if ( svc->flags & CSCHED_FLAG_VCPU_PARKED)
+                {
+                    /*
+                     * It's important to unset the flag AFTER the unpause()
+                     * call to make sure the VCPU's priority is not boosted
+                     * if it is woken up here.
+                     */
+                    CSCHED_STAT_CRANK(vcpu_unpark);
+                    vcpu_unpause(svc->vcpu);
+                    svc->flags &= ~CSCHED_FLAG_VCPU_PARKED;
+                }
+
+                /* Upper bound on credits means VCPU stops earning */
+                if ( credit > CSCHED_CREDITS_PER_TSLICE )
+                {
+                    __csched_vcpu_acct_stop_locked(svc);
+                    credit = 0;
+                    atomic_set(&svc->credit, credit);
+                }
+            }
+
+            CSCHED_VCPU_STAT_SET(svc, credit_last, credit);
+            CSCHED_VCPU_STAT_SET(svc, credit_incr, credit_fair);
+            credit_balance += credit;
+        }
+    }
+
+    csched_priv.credit_balance = credit_balance;
+
+    spin_unlock_irqrestore(&csched_priv.lock, flags);
+
+    /* Inform each CPU that its runq needs to be sorted */
+    csched_priv.runq_sort++;
+}
+
+/* Per-CPU scheduler tick, run from the tick tasklet: charge credits to
+ * the running VCPU, do host-wide accounting on the master CPU, re-sort
+ * the runq if needed and re-arm the ticker.
+ */
+static void _csched_tick(unsigned long _unused)
+{
+       int my_cpu = get_cpu();
+       struct csched_pcpu *spc = CSCHED_PCPU(my_cpu);
+       unsigned int cpu = spc->cpu;
+       ktime_t now;
+
+       /* if we fail to lock, it indicates the KVM modules are going to
+        * be unloaded
+        */
+       if(cmpxchg(&spc->in_use, false, true) != false) {
+           put_cpu();
+           return;
+       }
+
+       BUG_ON(thread_preemptible());
+       BUG_ON(cpu != my_cpu);
+
+       TRACE_2D(TRC_INTERNAL_CSCHED_TICK, my_cpu, __LINE__);
+       // tasklet_disable(&per_cpu(schedule_data, my_cpu).sched_tasklet);
+try_again:
+       while(per_cpu(schedule_data, my_cpu).sched_state == SCHEDULER_USER)
+           ;
+       if(per_cpu(schedule_data, my_cpu).sched_state == SCHEDULER_KERNEL) {
+           /* this indicates that someone else has entered the
+            * critical region now; we should not continue
+            */
+           TRACE_2D(TRC_INTERNAL_CSCHED_TICK, my_cpu, __LINE__);
+           if(!shutting_down) {
+               hrtimer_cancel(&spc->ticker);
+               now = spc->ticker.base->get_time();
+               hrtimer_start(&spc->ticker,
+                       ktime_add_ns(now, 10000),
+                       HRTIMER_MODE_ABS);
+           }
+           // tasklet_enable(&per_cpu(schedule_data, my_cpu).sched_tasklet);
+           spc->in_use = false;
+           put_cpu();
+           return;
+       }
+       if (cmpxchg(&per_cpu(schedule_data, my_cpu).sched_state,
+                   SCHEDULER_FREE, SCHEDULER_KERNEL) != SCHEDULER_FREE)
+           goto try_again;
+
+       TRACE_2D(TRC_INTERNAL_CSCHED_TICK, my_cpu, __LINE__);
+
+       spc->tick++;
+
+       /*
+        * Accounting for running VCPU
+        */
+       if ( !is_idle_vcpu(current_vcpu) )
+           csched_vcpu_acct(cpu);
+
+       TRACE_2D(TRC_INTERNAL_CSCHED_TICK, my_cpu, __LINE__);
+       /*
+        * Host-wide accounting duty
+        *
+        * Note: Currently, this is always done by the master boot CPU.
+        * Eventually, we could distribute or at the very least cycle
+        * the duty.
+        */
+       if ( (csched_priv.master == cpu) &&
+            (spc->tick % CSCHED_TICKS_PER_ACCT) == 0 )
+       {
+           csched_acct();
+       }
+
+       TRACE_2D(TRC_INTERNAL_CSCHED_TICK, my_cpu, __LINE__);
+       /*
+        * Check if runq needs to be sorted
+        *
+        * Every physical CPU resorts the runq after the accounting master has
+        * modified priorities. This is a special O(n) sort and runs at most
+        * once per accounting period (currently 30 milliseconds).
+        */
+       csched_runq_sort(cpu);
+       TRACE_2D(TRC_INTERNAL_CSCHED_TICK, my_cpu, __LINE__);
+
+       if (!shutting_down) {
+           now = spc->ticker.base->get_time();
+           hrtimer_start(&spc->ticker,
+                   ktime_add_ns(now, MILLISECS(CSCHED_MSECS_PER_TICK)),
+                   HRTIMER_MODE_ABS);
+       }
+       per_cpu(schedule_data, my_cpu).sched_state = SCHEDULER_FREE;
+       // tasklet_enable(&per_cpu(schedule_data, my_cpu).sched_tasklet);
+       TRACE_2D(TRC_INTERNAL_CSCHED_TICK, my_cpu, __LINE__);
+       spc->in_use = false;
+       put_cpu();
+}
+
+/* NOTICE:
+ *   Linux cannot guarantee that a timer armed on one cpu will be
+ *   triggered on that cpu. That is to say: one cpu might receive
+ *   other cpu's timer expire message. In this case, the message should
+ *   be relayed to the corresponding cpu. This is done via an IPI-tasklet
+ *                                                       --Alex
+ */
+static enum hrtimer_restart csched_tick_stub(struct hrtimer *data)
+{
+       u64 delta;
+       int cur_cpu = get_cpu();
+       struct csched_pcpu *spc = container_of(data, struct csched_pcpu,
+               ticker);
+       BUG_ON(!data);
+       BUG_ON(!spc);
+
+       /* if the ticker does not belong to this cpu, relay it */
+       if(spc->cpu != cur_cpu){
+           int r;
+           r = insert_pending_ipi(cur_cpu, cpumask_of_cpu(spc->cpu),
+                   csched_tick_stub, (void*)data, 1);
+           if (r) {
+               dump_traces(NULL);
+               BUG_ON(1);
+           }
+           TRACE_3D(TRC_CSCHED_TICK,cur_cpu, spc->cpu, __LINE__);
+           wake_up(&per_cpu(schedule_data, cur_cpu).ipi_wq);
+       }else {
+           TRACE_3D(TRC_CSCHED_TICK,cur_cpu, spc->cpu, __LINE__);
+           tasklet_schedule(&per_cpu(schedule_data, cur_cpu).tick_tasklet);
+       }
+
+       /* the timer will be restarted by the tick_tasklet */
+       /* to add hrtimer_cancel() here!*/
+       put_cpu();
+       TRACE_2D(TRC_CSCHED_TICK, cur_cpu, __LINE__);
+       return HRTIMER_NORESTART;
+}
+
+static struct csched_vcpu *
+csched_runq_steal(int peer_cpu, int cpu, int pri)
+{
+    const struct csched_pcpu * const peer_pcpu = CSCHED_PCPU(peer_cpu);
+    const struct kvm_vcpu * const peer_vcpu =
+        per_cpu(schedule_data, peer_cpu).curr;
+    struct csched_vcpu *speer;
+    struct list_head *iter;
+    struct kvm_vcpu *vc;
+
+    /*
+     * Don't steal from an idle CPU's runq because it's about to
+     * pick up work from it itself.
+     */
+    if ( peer_pcpu != NULL && !is_idle_vcpu(peer_vcpu) )
+    {
+        list_for_each( iter, &peer_pcpu->runq )
+        {
+            speer = __runq_elem(iter);
+
+            /*
+             * If next available VCPU here is not of strictly higher
+             * priority than ours, this PCPU is useless to us.
+             */
+            if ( speer->pri <= pri )
+                break;
+
+            /* Is this VCPU runnable on our PCPU? */
+            vc = speer->vcpu;
+            BUG_ON( is_idle_vcpu(vc) );
+
+            if (__csched_vcpu_is_migrateable(vc, cpu))
+            {
+                /* We got a candidate. Grab it! */
+                CSCHED_VCPU_STAT_CRANK(speer, migrate_q);
+                CSCHED_STAT_CRANK(migrate_queued);
+                __runq_remove(speer);
+               printk("setting thread %d's cpuaffinity to %d!\n",
+                       vc->thread->pid, cpu);
+               kvm_sched_setaffinity(vc->thread->pid, cpumask_of_cpu(cpu));
+                vc->processor= cpu;
+                return speer;
+            }
+        }
+    }
+
+    CSCHED_STAT_CRANK(steal_peer_idle);
+    return NULL;
+}
+
+static struct csched_vcpu *
+csched_load_balance(int cpu, struct csched_vcpu *snext)
+{
+    struct csched_vcpu *speer;
+    cpumask_t workers;
+    int peer_cpu;
+
+    BUG_ON( cpu != snext->vcpu->processor);
+
+    if ( snext->pri == CSCHED_PRI_IDLE )
+        CSCHED_STAT_CRANK(load_balance_idle);
+    else if ( snext->pri == CSCHED_PRI_TS_OVER )
+        CSCHED_STAT_CRANK(load_balance_over);
+    else
+        CSCHED_STAT_CRANK(load_balance_other);
+
+    /*
+     * Peek at non-idling CPUs in the system, starting with our
+     * immediate neighbour.
+     */
+    cpus_andnot(workers, cpu_online_map, csched_priv.idlers);
+    cpu_clear(cpu, workers);
+    peer_cpu = cpu;
+
+    while ( !cpus_empty(workers) )
+    {
+        peer_cpu = __cycle_cpu(peer_cpu, &workers);
+        cpu_clear(peer_cpu, workers);
+
+        /*
+         * Get ahold of the scheduler lock for this peer CPU.
+         *
+         * Note: We don't spin on this lock but simply try it. Spinning could
+         * cause a deadlock if the peer CPU is also load balancing and trying
+         * to lock this CPU.
+         */
+        if ( !spin_trylock(&per_cpu(schedule_data, peer_cpu).schedule_lock) )
+        {
+            CSCHED_STAT_CRANK(steal_trylock_failed);
+            continue;
+        }
+       TRACE_3D(TRC_LOAD_BALANCE, peer_cpu, raw_smp_processor_id(), __LINE__);
+
+
+        /*
+         * Any work over there to steal?
+         */
+        speer = csched_runq_steal(peer_cpu, cpu, snext->pri);
+        spin_unlock(&per_cpu(schedule_data, peer_cpu).schedule_lock);
+       TRACE_3D(TRC_LOAD_BALANCE, cpu, peer_cpu, __LINE__);
+        if ( speer != NULL )
+            return speer;
+    }
+
+       TRACE_3D(TRC_LOAD_BALANCE, cpu, peer_cpu, __LINE__);
+    /* Failed to find more important work elsewhere... */
+    __runq_remove(snext);
+       TRACE_3D(TRC_LOAD_BALANCE, cpu, peer_cpu, __LINE__);
+    return snext;
+}
+
+/*
+ * This function is in the critical path. It is designed to be simple and
+ * fast for the common case.
+ */
+static struct task_slice
+csched_schedule(s_time_t now)
+{
+    const int cpu = smp_processor_id();
+    struct list_head * const runq = RUNQ(cpu);
+    struct csched_vcpu * const scurr = CSCHED_VCPU(current_vcpu);
+    struct csched_vcpu *snext;
+    struct task_slice ret;
+
+    if(shutting_down)  {
+       ret.time = MILLISECS(CSCHED_MSECS_PER_TSLICE);
+       ret.task = idle_vm_kvm->vcpus[cpu];
+       return ret;
+    }
+
+    CSCHED_STAT_CRANK(schedule);
+    CSCHED_VCPU_CHECK(current_vcpu);
+
+    /*
+     * Select next runnable local VCPU (ie top of local runq)
+     */
+    if ( vcpu_runnable(current_vcpu) )
+        __runq_insert(cpu, scurr);
+    else
+        BUG_ON( is_idle_vcpu(current_vcpu) || list_empty(runq) );
+
+    snext = __runq_elem(runq->next);
+
+    /*
+     * SMP Load balance:
+     *
+     * If the next highest priority local runnable VCPU has already eaten
+     * through its credits, look on other PCPUs to see if we have more
+     * urgent work... If not, csched_load_balance() will return snext, but
+     * already removed from the runq.
+     */
+    if ( snext->pri > CSCHED_PRI_TS_OVER )
+        __runq_remove(snext);
+    else
+        snext = csched_load_balance(cpu, snext);
+    TRACE_3D(TRC_CSCHED_SCHEDULE, cpu, snext, __LINE__);
+
+    /*
+     * Update idlers mask if necessary. When we're idling, other CPUs
+     * will tickle us when they get extra work.
+     */
+    if ( snext->pri == CSCHED_PRI_IDLE )
+    {
+        if ( !cpu_isset(cpu, csched_priv.idlers) )
+            cpu_set(cpu, csched_priv.idlers);
+    }
+    else if ( cpu_isset(cpu, csched_priv.idlers) )
+    {
+        cpu_clear(cpu, csched_priv.idlers);
+    }
+
+    /*
+     * Return task to run next...
+     */
+    ret.time = MILLISECS(CSCHED_MSECS_PER_TSLICE);
+    ret.task = snext->vcpu;
+
+    CSCHED_VCPU_CHECK(ret.task);
+    return ret;
+}
+
+static void
+csched_dump_vcpu(struct csched_vcpu *svc)
+{
+    printk("func %s unimplemented\n", __FUNCTION__);
+}
+
+static void
+csched_dump_pcpu(int cpu)
+{
+    struct list_head *runq, *iter;
+    struct csched_pcpu *spc;
+    struct csched_vcpu *svc;
+    int loop;
+
+    spc = CSCHED_PCPU(cpu);
+    runq = &spc->runq;
+
+    printk(" sort=%d, sibling=0x%lx, core=0x%lx\n",
+            spc->runq_sort_last,
+            per_cpu(cpu_sibling_map,cpu).bits[0],
+            per_cpu(cpu_core_map,cpu).bits[0]);
+
+    /* current VCPU */
+    svc = CSCHED_VCPU(per_cpu(schedule_data, cpu).curr);
+    if ( svc )
+    {
+        printk("\trun: ");
+        csched_dump_vcpu(svc);
+    }
+
+    loop = 0;
+    list_for_each( iter, runq )
+    {
+        svc = __runq_elem(iter);
+        if ( svc )
+        {
+            printk("\t%3d: ", ++loop);
+            csched_dump_vcpu(svc);
+        }
+    }
+}
+
+static void
+csched_dump(void)
+{
+    struct list_head *iter_sdom, *iter_svc;
+    int loop;
+
+    printk("info:\n"
+           "\tncpus              = %u\n"
+           "\tmaster             = %u\n"
+           "\tcredit             = %u\n"
+           "\tcredit balance     = %d\n"
+           "\tweight             = %u\n"
+           "\trunq_sort          = %u\n"
+           "\tdefault-weight     = %d\n"
+           "\tmsecs per tick     = %dms\n"
+           "\tcredits per tick   = %d\n"
+           "\tticks per tslice   = %d\n"
+           "\tticks per acct     = %d\n",
+           csched_priv.ncpus,
+           csched_priv.master,
+           csched_priv.credit,
+           csched_priv.credit_balance,
+           csched_priv.weight,
+           csched_priv.runq_sort,
+           CSCHED_DEFAULT_WEIGHT,
+           CSCHED_MSECS_PER_TICK,
+           CSCHED_CREDITS_PER_TICK,
+           CSCHED_TICKS_PER_TSLICE,
+           CSCHED_TICKS_PER_ACCT);
+
+    printk("idlers: 0x%lx\n", csched_priv.idlers.bits[0]);
+
+    CSCHED_STATS_PRINTK();
+
+    printk("active vcpus:\n");
+    loop = 0;
+    list_for_each( iter_sdom, &csched_priv.active_sdom )
+    {
+        struct csched_vm *sdom;
+        sdom = list_entry(iter_sdom, struct csched_vm, active_sdom_elem);
+
+        list_for_each( iter_svc, &sdom->active_vcpu )
+        {
+            struct csched_vcpu *svc;
+            svc = list_entry(iter_svc, struct csched_vcpu, active_vcpu_elem);
+
+            printk("\t%3d: ", ++loop);
+            csched_dump_vcpu(svc);
+        }
+    }
+}
+static void csched_stop_schedule(int cpu)
+{
+    struct csched_pcpu *spc;
+    spc = CSCHED_PCPU(cpu);
+    if(!spc) {
+       printk("func %s line %d cpu %d\n",
+               __FUNCTION__, __LINE__, cpu);
+       BUG_ON(1);
+    }
+    hrtimer_cancel(&spc->ticker);
+}
+
+
+/* Tickers cannot be kicked until SMP subsystem is alive. */
+int csched_start(int cpu)
+{
+       struct csched_pcpu *spc;
+       ktime_t now;
+
+       /* Is the credit scheduler initialised? */
+       if ( csched_priv.ncpus == 0 ){
+               printk("(ERROR) You should not see func %s line %d\n",
+                       __FUNCTION__, __LINE__);
+               BUG_ON(1);
+               return 1;
+       }
+
+       tasklet_init(&per_cpu(schedule_data, cpu).tick_tasklet,
+               _csched_tick, (unsigned long)NULL);
+       spc = CSCHED_PCPU(cpu);
+       if(!spc) {
+           printk("func %s line %d cpu %d\n",
+                   __FUNCTION__, __LINE__, cpu);
+           BUG_ON(1);
+           return 1;
+       }
+
+       now = spc->ticker.base->get_time();
+
+       hrtimer_start(&spc->ticker,
+               ktime_add_ns(now, MILLISECS(CSCHED_MSECS_PER_TICK)),
+               HRTIMER_MODE_ABS);
+
+       return 0;
+}
+static void
+csched_init(void)
+{
+    spin_lock_init(&csched_priv.lock);
+    INIT_LIST_HEAD(&csched_priv.active_sdom);
+    csched_priv.ncpus = 0;
+    csched_priv.master = UINT_MAX;
+    cpus_clear(csched_priv.idlers);
+    csched_priv.weight = 0U;
+    csched_priv.credit = 0U;
+    csched_priv.credit_balance = 0;
+    csched_priv.runq_sort = 0U;
+    CSCHED_STATS_RESET();
+}
+
+#define SCHEDULER_CREDIT    1
+struct scheduler sched_credit_def = {
+    .name           = "SMP Credit Scheduler",
+    .opt_name       = "credit",
+    .sched_id       = SCHEDULER_CREDIT,
+
+    .init_vm       = csched_vm_init,
+    .destroy_vm            = csched_vm_destroy,
+
+    .init_vcpu      = csched_vcpu_init,
+    .destroy_vcpu   = csched_vcpu_destroy,
+
+    .sleep          = csched_vcpu_sleep,
+    .wake           = csched_vcpu_wake,
+
+    .read_schedule_info = csched_read_param,
+    .write_schedule_info = csched_write_param,
+
+    .pick_cpu       = csched_cpu_pick,
+    .do_schedule    = csched_schedule,
+
+    .disable_scheduler = csched_disable,
+    .start_scheduler = csched_start,
+    .stop_schedule  = csched_stop_schedule,
+
+    .dump_cpu_state = csched_dump_pcpu,
+    .dump_settings  = csched_dump,
+    .init           = csched_init,
+};
diff --git a/arch/x86/kvm/schedule.c b/arch/x86/kvm/schedule.c
new file mode 100644
index 0000000..d6938d5
--- /dev/null
+++ b/arch/x86/kvm/schedule.c
@@ -0,0 +1,959 @@
+/****************************************************************************
+ * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge
+ * (C) 2002-2003 University of Cambridge
+ * (C) 2004      - Mark Williamson - Intel Research Cambridge
+ * (C) 2009      - Xiaojian Liu
+ ****************************************************************************
+ *
+ *        File: common/schedule.c
+ *      Author: Rolf Neugebauer & Keir Fraser
+ *              Updated for generic API by Mark Williamson
+ *             Updated for KVM by Xiaojian Liu
+ *
+ * Description: Generic CPU scheduling code
+ *              implements support functionality for the Xen scheduler API.
+ *
+ */
+#include <linux/kvm_host.h>
+#include <linux/spinlock.h>
+#include <linux/trace.h>
+#include <linux/sched-if.h>
+#include <linux/ipi.h>
+#include <linux/proc_fs.h>
+#include <linux/uaccess.h>
+#include <linux/kthread.h>
+
+#ifdef DEBUG
+#define ASSERT(x)                                                  \
+do {                                                               \
+       if (!(x)) {                                                \
+           printk(KERN_EMERG "assertion failed %s: %d: %s\n",     \
+                  __FILE__, __LINE__, #x);                        \
+           BUG();                                                 \
+    }                                                              \
+} while (0)
+#else
+#define ASSERT(x) do { } while (0)
+#endif
+
+#if 0
+long (*sched_setaffinity_p)(pid_t pid, cpumask_t new_mask);
+#else
+extern long (*sched_setaffinity_p)(pid_t pid, cpumask_t* in_mask);
+#endif
+// EXPORT_SYMBOL_GPL(sched_setaffinity_p);
+
+DEFINE_PER_CPU(struct schedule_data, schedule_data);
+DEFINE_PER_CPU(rwlock_t, pseudo_cli);
+
+static struct list_head sched_vm_list;
+static spinlock_t  sched_vm_list_lock;
+
+#define KVM_SCHEDULER "kvm"
+
+struct proc_dir_entry *kvm_scheduler_root;
+
+#ifndef COMPAT
+
+/* Various timer handlers. */
+static enum hrtimer_restart s_timer_fn(struct hrtimer* timer);
+static void vcpu_singleshot_timer_fn(void *data);
+static void poll_timer_fn(void *data);
+
+/* This is global for now so that private implementations can reach it */
+
+extern struct scheduler sched_credit_def;
+static struct scheduler *schedulers[] = {
+    &sched_credit_def,
+    NULL
+};
+
+static struct scheduler ops;
+
+#define SCHED_OP(fn, ...)                                 \
+         (( ops.fn != NULL ) ? ops.fn( __VA_ARGS__ )      \
+          : (typeof(ops.fn(__VA_ARGS__)))0 )
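+
+/*
+ * e.g. SCHED_OP(do_schedule, now) calls ops.do_schedule(now) when that
+ * hook is set and otherwise evaluates to a zero value of the hook's
+ * return type.
+ */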
+
+void vcpu_pause(struct kvm_vcpu *v)
+{
+       ASSERT(v != current_vcpu);
+
+       atomic_inc(&v->pause_count);
+       vcpu_sleep_sync(v);
+}
+void vcpu_pause_nosync(struct kvm_vcpu *v)
+{
+       atomic_inc(&v->pause_count);
+       vcpu_sleep_nosync(v);
+}
+void vcpu_unpause(struct kvm_vcpu *v)
+{
+       if(atomic_dec_and_test(&v->pause_count))
+               vcpu_wake(v);
+}
+
+void vm_pause(struct kvm *kvm)
+{
+    int late_wait = -1;
+    int i;
+
+    for(i=0; i< KVM_MAX_VCPUS; i++) {
+       if(kvm->vcpus[i]) {
+           if(current_vcpu == kvm->vcpus[i]) {
+               atomic_inc(&kvm->vcpus[i]->pause_count);
+               late_wait = i;
+               continue;
+           }
+           vcpu_sleep_sync(kvm->vcpus[i]);
+       }
+    }
+    if (late_wait != -1) {
+       if(current_vcpu == kvm->vcpus[late_wait]) {
+           if(!is_host_vm(kvm) && !is_idle_vm(kvm)) {
+               tasklet_schedule(&per_cpu(schedule_data,
+                       raw_smp_processor_id()).sched_tasklet);
+               while(current_vcpu->kvm == kvm) {
+                   printk("waiting\n");
+                   schedule();
+               }
+           }
+       }
+    }
+}
+
+void vm_unpause(struct kvm *kvm)
+{
+       int i;
+       struct kvm_vcpu* v;
+       int cpu = get_cpu();
+
+       if ( atomic_dec_and_test(&kvm->pause_count) ){
+               for(i = 0; i < KVM_MAX_VCPUS; ++i) {
+                       if(unlikely(i == cpu)) continue;
+                       v = kvm->vcpus[i];
+                       if (v) {
+                               printk("me is %d waking vcpu %d\n", cpu, i);
+                               vcpu_wake(v);
+                       }
+               }
+               if(kvm->vcpus[cpu]){
+                   printk("waking myself %d now\n", cpu);
+                   vcpu_wake(kvm->vcpus[cpu]);
+               }
+       }
+       put_cpu();
+}
+void vm_pause_by_systemcontroller(struct kvm* kvm)
+{
+       vm_pause(kvm);
+       if(test_and_set_bool(kvm->is_paused_by_controller))
+               vm_unpause(kvm);
+}
+
+void vm_unpause_by_systemcontroller(struct kvm* kvm)
+{
+       if(test_and_clear_bool(kvm->is_paused_by_controller))
+               vm_unpause(kvm);
+
+}
+
+static inline void vcpu_runstate_change(
+    struct kvm_vcpu *v, int new_state, s_time_t new_entry_time)
+{
+    ASSERT(v->runstate.state != new_state);
+    ASSERT(spin_is_locked(&per_cpu(schedule_data,v->processor).schedule_lock));
+
+    v->runstate.time[v->runstate.state] +=
+        new_entry_time - v->runstate.state_entry_time;
+    v->runstate.state_entry_time = new_entry_time;
+    v->runstate.state = new_state;
+}
+
+void vcpu_runstate_get(struct kvm_vcpu *v, struct vcpu_runstate_info *runstate)
+{
+    if ( likely(v == current_vcpu) )
+    {
+        /* Fast lock-free path. */
+        memcpy(runstate, &v->runstate, sizeof(*runstate));
+        ASSERT(runstate->state == RUNSTATE_running);
+        runstate->time[RUNSTATE_running] += NOW() - runstate->state_entry_time;
+    }
+    else
+    {
+       unsigned long flags;
+       get_cpu();
+       local_irq_save(flags);
+        vcpu_schedule_lock_irq(v);
+        memcpy(runstate, &v->runstate, sizeof(*runstate));
+        runstate->time[runstate->state] += NOW() - runstate->state_entry_time;
+        vcpu_schedule_unlock_irq(v);
+       local_irq_restore(flags);
+       put_cpu();
+    }
+}
+
+int sched_init_vcpu(struct kvm_vcpu *v, unsigned int processor)
+{
+       int r;
+       struct kvm *kvm = v->kvm;
+
+       /*
+        * Initialize processor and affinity settings. The idler, and
+        * potentially domain-0 VCPUs, are pinned onto their respective
+        * physical CPUs.
+        */
+       v->processor = processor;
+       printk("sched_setaffinity_p is %p\n", sched_setaffinity_p);
+       if (!is_host_vcpu(v) && !is_idle_vcpu(v))
+           kvm_sched_setaffinity(v->thread->pid, cpumask_of_cpu(processor));
+
+       if ( is_idle_vm(kvm) || is_host_vm(kvm))
+               v->cpu_affinity = cpumask_of_cpu(processor);
+       else
+               cpus_setall(v->cpu_affinity);
+
+       /* Idle VCPUs are scheduled immediately. */
+       if ( is_idle_vm(kvm) )
+       {
+               per_cpu(schedule_data, v->processor).curr = v;
+               per_cpu(schedule_data, v->processor).idle = v;
+               v->is_running = 1;
+       }
+
+       TRACE_1D(TRC_SCHED_VM_ADD, v->vcpu_id);
+       r = SCHED_OP(init_vcpu, v);
+       return r;
+}
+
+void sched_destroy_vcpu(struct kvm_vcpu *v)
+{
+       get_cpu();
+       SCHED_OP(destroy_vcpu, v);
+       put_cpu();
+}
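+/* Look up a VM by its vmid on the global sched_vm_list. */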
+struct kvm* get_kvm_by_id(int id)
+{
+
+    struct kvm *kvm, *n;
+    spin_lock(&sched_vm_list_lock);
+    list_for_each_entry_safe(kvm, n, &sched_vm_list, vm_link) {
+       if (kvm->vmid == id)  {
+           spin_unlock(&sched_vm_list_lock);
+           return kvm;
+       }
+    }
+    spin_unlock(&sched_vm_list_lock);
+    return NULL;
+}
+static inline struct kvm* get_proc_kvm(struct file *file)
+{
+    struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
+    vmid_t id;
+
+    if(strcmp(pde->name, "host") == 0)
+       return host_vm_kvm;
+
+    sscanf(pde->name, "%d", &id);
+    return get_kvm_by_id(id);
+}
+
+#define MAX_SCHEDULE_PARAMS 50
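+/* Write handler for a VM's /proc scheduler control file: pause the VM's
+ * other VCPUs, take the schedule lock if the target VM is the caller's own,
+ * and pass the user-supplied string to the policy's write_schedule_info hook. */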
+static ssize_t kvm_scheduler_write(struct file *file, const char __user *buffer,
+       size_t count, loff_t *ppos)
+{
+    char scheduler_param[MAX_SCHEDULE_PARAMS] = { '\0'};
+    struct kvm* kvm = get_proc_kvm(file);
+    int i;
+    ssize_t r = count;
+    unsigned long flags;
+
+    get_cpu();
+
+    if(!kvm) {
+       r = -EINVAL;
+       goto quit;
+    }
+
+    if (count > MAX_SCHEDULE_PARAMS - 1 ) {
+       r = -EINVAL;
+       goto quit;
+    }
+    if(copy_from_user(scheduler_param, buffer, count)) {
+       r = -EFAULT;
+       goto quit;
+    }
+    scheduler_param[count] = '\0';
+
+    for(i=0; i < KVM_MAX_VCPUS; i++){
+       struct kvm_vcpu *v = kvm->vcpus[i];
+       if(v && v != current_vcpu)
+           vcpu_pause(v);
+    }
+
+    if ( kvm == current_vcpu->kvm) {
+       local_irq_save(flags);
+       if(!vcpu_schedule_try_lock(current_vcpu)) {
+           local_irq_restore(flags);
+           r = -EAGAIN;
+           goto quit_with_unpause;
+       }
+    }
+
+    if ( (SCHED_OP(write_schedule_info, kvm, scheduler_param)) == 0 )
+       printk("failed to write schedule info!\n");
+
+    if ( kvm == current_vcpu->kvm) {
+       vcpu_schedule_unlock(current_vcpu);
+       local_irq_restore(flags);
+    }
+
+quit_with_unpause:
+    for(i=0; i < KVM_MAX_VCPUS; i++){
+       struct kvm_vcpu *v = kvm->vcpus[i];
+       if(v && v != current_vcpu)
+           vcpu_unpause(v);
+    }
+quit:
+    put_cpu();
+    return r;
+}
+
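+/* Read handler for a VM's /proc scheduler control file: with the VM's other
+ * VCPUs paused, ask the policy's read_schedule_info hook to format the
+ * current scheduling parameters into a buffer returned to user space. */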
+static ssize_t kvm_scheduler_read(struct file *file,
+       char __user * buffer, size_t count, loff_t *ppos)
+{
+    int res;
+    struct proc_dir_entry *pde = PDE(file->f_path.dentry->d_inode);
+    struct kvm *kvm;
+    int i;
+    char *msg;
+    unsigned long flags;
+
+    get_cpu();
+
+    msg = kmalloc(256, GFP_KERNEL);
+    if (!msg) {
+       res = -ENOMEM;
+       goto quit;
+    }
+
+    kvm = get_proc_kvm(file);
+    if(!kvm) {
+       sprintf(msg, "file /proc/%s/%s does not have a corresponding kvm!\n",
+               KVM_SCHEDULER, pde->name);
+       goto quit_with_msg;
+    }
+
+    for(i=0; i < KVM_MAX_VCPUS; i++){
+       struct kvm_vcpu *v = kvm->vcpus[i];
+       if(v && v != current_vcpu)
+           vcpu_pause(v);
+    }
+
+    if ( kvm == current_vcpu->kvm){
+       local_irq_save(flags);
+       if(!vcpu_schedule_try_lock(current_vcpu)) {
+           local_irq_restore(flags);
+           snprintf(msg, 256,
+                   "scheduler is locked by current domain. Try again!\n");
+           goto quit_with_unpause;
+       }
+    }
+
+    if ( (SCHED_OP(read_schedule_info, kvm, msg, 256)) == 0 )
+       printk("failed to read schedule info!\n");
+
+    if ( kvm == current_vcpu->kvm) {
+       vcpu_schedule_unlock(current_vcpu);
+       local_irq_restore(flags);
+    }
+
+quit_with_unpause:
+    for(i=0; i < KVM_MAX_VCPUS; i++){
+       struct kvm_vcpu *v = kvm->vcpus[i];
+       if(v && v != current_vcpu)
+           vcpu_unpause(v);
+    }
+
+quit_with_msg:
+    res = simple_read_from_buffer(buffer, count, ppos, msg, strlen(msg));
+    kfree(msg);
+quit:
+    put_cpu();
+    return res;
+
+}
+static const struct file_operations kvm_scheduler_ops = {
+    .read = kvm_scheduler_read,
+    .write = kvm_scheduler_write,
+};
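+/* Register a VM with the scheduler: assign it a vmid (derived from the
+ * creating process's pid for ordinary guests), add it to sched_vm_list,
+ * create its /proc control file, and call the policy's init_vm hook. */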
+int sched_init_vm(struct kvm *kvm)
+{
+    struct proc_dir_entry *entry;
+    int error = 0;
+    char name[MAX_PROCESSID_LEN];
+    int r;
+    int cpu = get_cpu();
+
+    BUG_ON(kvm->vmid);
+
+    if(unlikely(is_idle_vm(kvm)))
+       kvm->vmid = IDLE_VM_ID;
+    else if(unlikely(is_host_vm(kvm)))
+       kvm->vmid = HOST_VM_ID;
+    else {
+       kvm->vmid = (unsigned long)current->pid;
+       if(kvm->vmid > MAX_PROCESSID) {
+           printk("process id %lx too big. only the lower 32bits are used!\n",
+                   kvm->vmid);
+           kvm->vmid &= 0x0ffffffff;
+       }
+    }
+    spin_lock(&sched_vm_list_lock);
+    list_add(&kvm->vm_link, &sched_vm_list);
+    spin_unlock(&sched_vm_list_lock);
+
+    if(!is_idle_vm(kvm) && (kvm_scheduler_root != NULL)) {
+       if(!is_host_vm(kvm))
+           sprintf(name, "%d", kvm->vmid);
+       else
+           sprintf(name, "host");
+       entry = create_proc_entry(name, S_IRUGO | S_IWUGO | S_IFREG,
+               kvm_scheduler_root);
+       if (entry)
+           entry->proc_fops = &kvm_scheduler_ops;
+       else {
+           printk("failed to create vm %x's corresponding procfs\n", kvm->vmid);
+           printk("user will not be able to control its scheduling!\n");
+           kvm->vmid = ANONY_VM_ID;
+       }
+    }
+
+    r = SCHED_OP(init_vm, kvm);
+    put_cpu();
+    return r;
+}
+
+void sched_destroy_vm(struct kvm *d)
+{
+    char name[MAX_PROCESSID_LEN];
+
+    get_cpu();
+
+    spin_lock(&sched_vm_list_lock);
+    list_del(&d->vm_link);
+    spin_unlock(&sched_vm_list_lock);
+
+    if(d->vmid){
+       if(is_host_vm(d))
+           sprintf(name, "host");
+       else
+           sprintf(name, "%d", d->vmid);
+       remove_proc_entry(name, kvm_scheduler_root);
+    }
+
+    SCHED_OP(destroy_vm, d);
+    put_cpu();
+}
+
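+/* Put a non-runnable VCPU to sleep via the policy's sleep hook, moving its
+ * runstate to offline.  "nosync" means we do not wait for the VCPU to stop
+ * running on its CPU; vcpu_sleep_sync() below does. */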
+void vcpu_sleep_nosync(struct kvm_vcpu *v)
+{
+    unsigned long flags;
+    unsigned long my_flags;
+
+    get_cpu();
+    local_irq_save(my_flags);
+    vcpu_schedule_lock_irqsave(v, flags);
+
+    if ( likely(!vcpu_runnable(v)) )
+    {
+        if ( v->runstate.state == RUNSTATE_runnable )
+            vcpu_runstate_change(v, RUNSTATE_offline, NOW());
+
+        SCHED_OP(sleep, v);
+    }
+
+    vcpu_schedule_unlock_irqrestore(v, flags);
+    local_irq_restore(my_flags);
+    put_cpu();
+
+    TRACE_1D(TRC_SCHED_SLEEP, v->vcpu_id);
+}
+void sync_vcpu_execstate(struct kvm_vcpu *v)
+{
+    printk("TODO: func %s called. \n", __FUNCTION__);
+}
+void vcpu_sleep_sync(struct kvm_vcpu *v)
+{
+    vcpu_sleep_nosync(v);
+
+    while ( !vcpu_runnable(v) && v->is_running )
+        cpu_relax();
+
+    sync_vcpu_execstate(v);
+}
+
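+/* Wake a VCPU: if it is now runnable, update its runstate and call the
+ * policy's wake hook; a VCPU that is paused for reasons other than blocking
+ * is marked offline instead. */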
+void vcpu_wake(struct kvm_vcpu *v)
+{
+    unsigned long flags;
+    unsigned long my_flags;
+    int tmp_cpu = get_cpu();
+    local_irq_save(my_flags);
+    vcpu_schedule_lock_irqsave(v, flags);
+    TRACE_2D(TRC_SCHED_WAKE, tmp_cpu, v);
+
+    if ( likely(vcpu_runnable(v)) )
+    {
+        if ( v->runstate.state >= RUNSTATE_blocked )
+            vcpu_runstate_change(v, RUNSTATE_runnable, NOW());
+        SCHED_OP(wake, v);
+    }
+    else if ( !test_bit(_VPF_blocked, &v->pause_flags) )
+    {
+        if ( v->runstate.state == RUNSTATE_blocked )
+            vcpu_runstate_change(v, RUNSTATE_offline, NOW());
+    }
+
+    vcpu_schedule_unlock_irqrestore(v, flags);
+    local_irq_restore(my_flags);
+
+    TRACE_3D(TRC_SCHED_WAKE, tmp_cpu, v->vcpu_id, __LINE__);
+    put_cpu();
+}
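+/* Move a VCPU whose _VPF_migrating flag is set to the physical CPU chosen by
+ * the policy's pick_cpu hook, then wake it there.  Nothing happens if the
+ * VCPU is still running. */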
+static void vcpu_migrate(struct kvm_vcpu *v)
+{
+    unsigned long flags;
+    unsigned long my_flags;
+    int old_cpu;
+    int tmp_cpu = get_cpu();
+
+    local_irq_save(my_flags);
+    vcpu_schedule_lock_irqsave(v, flags);
+
+    /*
+     * NB. Check of v->running happens /after/ setting migration flag
+     * because they both happen in (different) spinlock regions, and those
+     * regions are strictly serialised.
+     */
+    if ( v->is_running ||
+         !test_and_clear_bit(_VPF_migrating, &v->pause_flags) )
+    {
+        vcpu_schedule_unlock_irqrestore(v, flags);
+       local_irq_restore(my_flags);
+       put_cpu();
+        return;
+    }
+
+    /* Switch to new CPU, then unlock old CPU. */
+    old_cpu = v->processor;
+    v->processor = SCHED_OP(pick_cpu, v);
+
+    /* Because v->processor has already changed, we must not use
+     * vcpu_schedule_unlock_irqrestore(); unlock the old CPU's lock directly.
+     */
+    BUG_ON(old_cpu != tmp_cpu);
+    spin_unlock(&per_cpu(schedule_data, old_cpu).schedule_lock);
+    pseudo_irq_restore(flags);
+    local_irq_restore(my_flags);
+    /* Wake on new CPU. */
+    vcpu_wake(v);
+    put_cpu();
+}
+
+/*
+ * Force a VCPU through a deschedule/reschedule path.
+ * For example, using this when setting the periodic timer period means that
+ * most periodic-timer state need only be touched from within the scheduler
+ * which can thus be done without need for synchronisation.
+ */
+void vcpu_force_reschedule(struct kvm_vcpu *v)
+{
+    printk("TODO: func %s called\n", __FUNCTION__);
+}
+
+int vcpu_set_affinity(struct kvm_vcpu *v, cpumask_t *affinity)
+{
+    printk("TODO: func %s called\n", __FUNCTION__);
+    return 0;
+}
+
+/* Block the currently-executing domain until a pertinent event occurs. */
+static long do_block(void)
+{
+    printk("TODO: func %s called\n", __FUNCTION__);
+    return 0;
+}
+
+/* Voluntarily yield the processor for this allocation. */
+static long do_yield(void)
+{
+    TRACE_1D(TRC_SCHED_YIELD, current_vcpu->vcpu_id);
+    tasklet_schedule(&per_cpu(schedule_data, raw_smp_processor_id()).sched_tasklet);
+    return 0;
+}
+long do_sched_op_compat(int cmd, unsigned long arg)
+{
+    long ret = 0;
+    printk("TODO: unimplemented func %s \n", __FUNCTION__);
+    return ret;
+}
+
+typedef long ret_t;
+
+#endif /* !COMPAT */
+
+#ifndef COMPAT
+
+/* sched_id - fetch ID of current scheduler */
+int sched_id(void)
+{
+    return ops.sched_id;
+}
+
+
+void continue_running(struct kvm_vcpu *vcpu)
+{
+}
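+/* Called once the previous VCPU's state has been saved: clear is_running,
+ * flip its status from RUNNING to YIELD, and complete any pending migration. */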
+void context_saved(struct kvm_vcpu *prev)
+{
+       prev->is_running = 0;
+       cmpxchg(&prev->status, VCPU_RUNNING, VCPU_YIELD);
+
+       if(unlikely(test_bit(_VPF_migrating, &prev->pause_flags))){
+               vcpu_migrate(prev);
+       }
+}
+void vm_context_switch(struct kvm_vcpu *prev, struct kvm_vcpu *next)
+{
+       context_saved(prev);
+       cmpxchg(&next->status, VCPU_YIELD, VCPU_RUNNING);
+       wake_up(&next->wq);
+}
+/* Return Value: 0
+ * meaning: upon completion, the pseudo_irq remains disabled
+ */
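+/* Ask the policy for the next task slice, update both VCPUs' runstates,
+ * perform the context switch, and re-arm the scheduler and watchdog timers. */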
+static int __vcpu_schedule(int my_cpu)
+{
+    struct kvm_vcpu          *prev = current_vcpu, *next = NULL;
+    s_time_t           now;
+    ktime_t            ktime_now;
+    struct schedule_data *sd;
+    struct task_slice     next_slice;
+    s32                   r_time;     /* time for new dom to run */
+
+    BUG_ON(thread_preemptible());
+
+    ASSERT(!in_irq());
+
+    sd = &per_cpu(schedule_data, my_cpu);
+
+    /* if the kvm module is to be unloaded, the lock might fail */
+    if(cmpxchg(&sd->in_use, false, true) != false)
+       return 0;
+
+    spin_lock(&sd->schedule_lock);
+
+    hrtimer_cancel(&sd->s_timer);
+    ktime_now = sd->s_timer.base->get_time();
+    now = ktime_to_ns(ktime_now);
+
+    /* get policy-specific decision on scheduling... */
+    next_slice = ops.do_schedule(now);
+
+    r_time = next_slice.time;
+    next = next_slice.task;
+
+    sd->curr = next;
+
+    if(unlikely(shutting_down)) {
+       spin_unlock(&sd->schedule_lock);
+       goto finished;
+    }
+
+    if ( unlikely(prev == next) )
+    {
+           spin_unlock(&sd->schedule_lock);
+
+           continue_running(prev);
+           goto finished;
+    }
+
+    TRACE_1D(TRC_SCHED_SWITCH_INFPREV,
+             // prev->domain->domain_id,
+             now - prev->runstate.state_entry_time);
+    TRACE_2D(TRC_SCHED_SWITCH_INFNEXT,
+             // next->domain->domain_id,
+             (next->runstate.state == RUNSTATE_runnable) ?
+             (now - next->runstate.state_entry_time) : 0,
+             r_time);
+
+    ASSERT(prev->runstate.state == RUNSTATE_running);
+    vcpu_runstate_change(
+        prev,
+        (test_bit(_VPF_blocked, &prev->pause_flags) ? RUNSTATE_blocked :
+         (vcpu_runnable(prev) ? RUNSTATE_runnable : RUNSTATE_offline)),
+        now);
+
+    ASSERT(next->runstate.state != RUNSTATE_running);
+    vcpu_runstate_change(next, RUNSTATE_running, now);
+
+    ASSERT(!next->is_running);
+    next->is_running = 1;
+
+    spin_unlock(&sd->schedule_lock);
+
+    TRACE_3D(TRC_SCHED_SWITCH, __LINE__, prev->vcpu_id, next->vcpu_id);
+    vm_context_switch(prev, next);
+finished:
+    hrtimer_cancel(&sd->watchdog);
+    if (!shutting_down) {
+       hrtimer_start(&sd->s_timer, ktime_add_ns(ktime_now, r_time), HRTIMER_MODE_ABS);
+
+       /* restart the watchdog */
+       ktime_now = sd->watchdog.base->get_time();
+       hrtimer_start(&sd->watchdog, ktime_add_ns(ktime_now, WATCHDOG_NS), HRTIMER_MODE_ABS);
+    }
+    sd->in_use = false;
+    return 0;
+}
+/*
+ * The main function
+ * - deschedule the current domain (scheduler independent).
+ * - pick a new domain (scheduler dependent).
+ */
+static void vcpu_schedule(unsigned long _unused)
+{
+
+    int my_cpu = raw_smp_processor_id();
+    // tasklet_disable(&per_cpu(schedule_data, my_cpu).tick_tasklet);
+try_again:
+    while(per_cpu(schedule_data, my_cpu).sched_state == SCHEDULER_USER)
+       ;
+    if(per_cpu(schedule_data, my_cpu).sched_state == SCHEDULER_KERNEL) {
+       /* Someone else has already entered the critical section, so we
+        * abort scheduling the vcpus this time.
+        */
+       struct schedule_data *sd = &per_cpu(schedule_data, my_cpu);
+       ktime_t now = sd->s_timer.base->get_time();
+
+       hrtimer_cancel(&sd->s_timer);
+       if(!shutting_down)
+           hrtimer_start(&sd->s_timer, ktime_add_ns(now, 10000), HRTIMER_MODE_ABS);
+       // tasklet_enable(&per_cpu(schedule_data, my_cpu).tick_tasklet);
+       return;
+    }
+    if(cmpxchg(&per_cpu(schedule_data, my_cpu).sched_state,
+               SCHEDULER_FREE, SCHEDULER_KERNEL) != SCHEDULER_FREE)
+       goto try_again;
+
+    preempt_disable();
+    __vcpu_schedule(my_cpu);
+    per_cpu(schedule_data, my_cpu).sched_state = SCHEDULER_FREE;
+
+    // tasklet_enable(&per_cpu(schedule_data, my_cpu).tick_tasklet);
+    preempt_enable();
+    // put_cpu();
+
+    return;
+}
+
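+/* Pending-IPI callback used by s_timer_fn: kick the scheduling tasklet of
+ * the CPU it runs on. */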
+void kvm_force_tasklet_schedule(void *_unused)
+{
+    get_cpu();
+    tasklet_schedule(&per_cpu(schedule_data, raw_smp_processor_id()).sched_tasklet);
+    put_cpu();
+}
+/* The scheduler timer: force a run through the scheduler */
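+/* If the timer fires on a different physical CPU than the one this
+ * schedule_data belongs to, the reschedule request is forwarded through the
+ * pending-IPI helper thread instead of being handled locally. */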
+static enum hrtimer_restart s_timer_fn(struct hrtimer* timer)
+{
+    int cpu = raw_smp_processor_id();
+    struct schedule_data *sd = container_of(timer, struct schedule_data, s_timer);
+    if(cpu != sd->id){
+       int r;
+       r = insert_pending_ipi(cpu, cpumask_of_cpu(sd->id), kvm_force_tasklet_schedule, NULL, 1);
+       // tasklet_schedule(&sd->ipi_tasklet);
+       wake_up(&sd->ipi_wq);
+       if(r){
+           dump_traces(NULL);
+           BUG_ON(1);
+       }
+    }else{
+       per_cpu(schedule_data, raw_smp_processor_id()).can_migrate = !in_atomic_preempt_off();
+       tasklet_schedule(&per_cpu(schedule_data, cpu).sched_tasklet);
+    }
+    /* no need to restart the timer. the vcpu_schedule() will do it */
+    return HRTIMER_NORESTART;
+}
+static void per_cpu_stop_sched(int cpu)
+{
+    struct schedule_data* sd = &per_cpu(schedule_data, cpu);
+    hrtimer_cancel(&sd->s_timer);
+    hrtimer_cancel(&sd->watchdog);
+}
+static void per_cpu_kill_scheduler(int cpu)
+{
+    struct schedule_data* sd = &per_cpu(schedule_data, cpu);
+    tasklet_kill(&sd->sched_tasklet);
+
+}
+
+bool shutting_down = false;
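+/* Begin shutdown: stop each CPU's IPI helper thread, tell the policy to stop
+ * scheduling on that CPU, and cancel its scheduler and watchdog timers. */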
+void stop_auto_schedule(void)
+{
+    int i;
+
+    shutting_down = true;
+    barrier();
+
+    for_each_online_cpu(i) {
+       struct schedule_data *sd = &per_cpu(schedule_data, i);
+
+       sd->ipi_quit = true;
+       wake_up(&sd->ipi_wq);
+       printk("waiting for ipi helper to stop!\n");
+       while(sd->ipi_quit) schedule();
+       printk("ipi helper stopped!\n");
+       SCHED_OP(stop_schedule, i);
+       per_cpu_stop_sched(i);
+
+    }
+}
+static inline void scheduler_disabled(int cpu)
+{
+    struct schedule_data *sd = &per_cpu(schedule_data, cpu);
+    per_cpu_kill_scheduler(cpu);
+}
+void wait_scheduler_stops(void)
+{
+    int i;
+    for_each_online_cpu(i) {
+       SCHED_OP(disable_scheduler, i);
+       scheduler_disabled(i);
+    }
+}
+void scheduler_start(void)
+{
+    int i;
+
+    for_each_online_cpu(i)  {
+       if(SCHED_OP(start_scheduler, i)) {
+           printk("error starting scheduler on cpu %d!\n", i);
+           BUG_ON(1);
+       }
+    }
+}
+void scheduler_destroy(void)
+{
+    int i;
+
+    if(kvm_scheduler_root)
+       remove_proc_entry(KVM_SCHEDULER, NULL);
+
+    for_each_online_cpu(i) {
+       destroy_pending_ipi_buf(i);
+    }
+}
+
+extern void inject_pending_ipi(void);
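+/* Per-CPU scheduler setup: the scheduling tasklet, an IPI helper kthread
+ * pinned to the CPU, the schedule lock, the scheduler hrtimer and the
+ * watchdog hrtimer (which is started immediately). */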
+static void per_cpu_init_sched(int cpu)
+{
+       struct pending_work * pd_work;
+       ktime_t now;
+       struct schedule_data *sd = &per_cpu(schedule_data, cpu);
+       struct task_struct *task;
+       char name[16];
+
+       sd->sched_state = SCHEDULER_FREE;
+       tasklet_init(&sd->sched_tasklet, vcpu_schedule, (unsigned long)NULL);
+
+       snprintf(name, sizeof(name), "ipi_helper_%d", cpu);
+       init_waitqueue_head(&sd->ipi_wq);
+       sd->ipi_quit = false;
+       task = kthread_run(inject_pending_ipi, cpu, name);
+       if(IS_ERR(task)){
+           printk("error creating ipi helper thread for cpu %d\n", cpu);
+           BUG_ON(1);
+       }
+       kvm_sched_setaffinity(task->pid, cpumask_of_cpu(cpu));
+
+       rwlock_init(&per_cpu(pseudo_cli, cpu));
+       spin_lock_init(&sd->schedule_lock);
+       sd->in_use = false;
+       sd->can_migrate = false;
+       sd->id = cpu;
+
+       /* the scheduler timer */
+       hrtimer_init(&sd->s_timer,
+               CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+       sd->s_timer.function = s_timer_fn;
+       // hrtimer_data_pointer(&sd->s_timer);
+
+       /* the watchdog timer */
+       hrtimer_init(&sd->watchdog,
+               CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+       sd->watchdog.function = dump_cpu_trace;
+       // hrtimer_data_pointer(&sd->watchdog);
+
+       /* start the watchdog */
+       now = sd->watchdog.base->get_time();
+       if(hrtimer_start(&sd->watchdog,
+                   ktime_add_ns(now, WATCHDOG_NS), HRTIMER_MODE_ABS)) {
+           printk("start watchdog timer failed!\n");
+           BUG_ON(1);
+       }
+}
+
+/* Initialise the data structures. */
+void scheduler_init(void)
+{
+       int i;
+
+       INIT_LIST_HEAD(&sched_vm_list);
+       spin_lock_init(&sched_vm_list_lock);
+
+       kvm_scheduler_root = proc_mkdir(KVM_SCHEDULER, NULL);
+       if (!kvm_scheduler_root){
+           printk("failed to create the /proc/kvm procfs entry!\n");
+           printk("the user scheduler control function will be disabled!\n");
+       }
+
+
+       for_each_online_cpu(i)
+           per_cpu_init_sched(i);
+
+       ops = *schedulers[0];
+       SCHED_OP(init);
+}
+
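+/* Dump the active scheduler's settings and every online CPU's run queue
+ * state. */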
+void dump_runq(unsigned char key)
+{
+    s_time_t      now = NOW();
+    int           i;
+    unsigned long flags;
+
+    local_irq_save(flags);
+
+    printk("Scheduler: %s (%s)\n", ops.name, ops.opt_name);
+    SCHED_OP(dump_settings);
+    printk("NOW=0x%08X%08X\n",  (u32)(now>>32), (u32)now);
+
+    for_each_online_cpu ( i )
+    {
+        spin_lock(&per_cpu(schedule_data, i).schedule_lock);
+        printk("CPU[%02d] ", i);
+        SCHED_OP(dump_cpu_state, i);
+        spin_unlock(&per_cpu(schedule_data, i).schedule_lock);
+    }
+
+    local_irq_restore(flags);
+}
+
+#endif /* !COMPAT */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */