I have attached the diff On Mon, May 11, 2009 at 8:16 PM, Mulyadi Santosa <mulyadi.santosa@xxxxxxxxx> wrote: > Hi... > > On Mon, May 11, 2009 at 4:22 PM, Sukanto Ghosh > <sukanto.cse.iitb@xxxxxxxxx> wrote: >> Hi, >> >> I was adding a field to the 3 trace_mark() calls in kernel/sched.c. >> The calls are at: i) context_switch(prev, next, ... ) ii) >> try_to_wake_up(p, ...) iii) wake_up_new_task(p, ...) functions >> >> The field is 'task'->mm->pfrate. (I have added the pfrate field in >> mm_struct), where 'task' is a placeholder for prev/next/p/rq->curr in >> those trace_mark() calls. I found that always either the next/p >> pointers are NULL or task->mm is NULL at that particular point. Is it >> supposed to be so ? Why ? >> >> PS: The trace_mark() in kernel/sched.c calls are for adding entries to >> the trace-file of sched_switch tracer. > > Always? hmm strange. sure you did the dereferencing to task_struct correctly? > > perhaps you could share to us the change you made? in a form of diff -u perhaps? > > regards, > > Mulyadi. > -- Regards, Sukanto Ghosh
--- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -224,10 +225,25 @@ struct mm_struct { * it has been since this task got the token. * Look at mm/thrash.c */ - unsigned int faultstamp; - unsigned int token_priority; - unsigned int last_interval; + u64 last_logical_faultstamp; /* global faultstamp value at last major fault */ + u64 last_interval; /* in jiffies */ + unsigned long long last_interval_ns; /* in nanoseconds (for intervals < 1 jiffie) */ + u64 last_logical_interval; /* in global faults count */ + unsigned int mm_maj_flts; + u64 page_flt_rate; /* unit: (ns)^(-1) */ + u64 last_faultstamp_jiffies64; /* jiffies of last major fault */ + cycles_t last_faultstamp_cycles; /* clock_value of last major fault */ + u64 token_expiry_jiffies64; /* expiry time of the swap_token */ + u64 creation_jiffies64; /* time at which the mm_struct was created */ + + + /* boolean value: whether this process has a swap token or not */ + unsigned int has_swap_token; + + spinlock_t swap_token_lock; + struct rb_node token_tree_node; + unsigned int token_priority; unsigned long flags; /* Must use atomic bitops to access the bits */ struct core_state *core_state; /* coredumping support */ diff --git a/include/linux/swap.h b/include/linux/swap.h index de40f16..d742a0e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -254,24 +254,28 @@ extern int remove_exclusive_swap_page(struct page *); struct backing_dev_info; /* linux/mm/thrash.c */ +extern int honor_swap_token; extern struct mm_struct * swap_token_mm; -extern void grab_swap_token(void); +extern void grab_swap_token(struct mm_struct *); extern void __put_swap_token(struct mm_struct *); static inline int has_swap_token(struct mm_struct *mm) { - return (mm == swap_token_mm); + return ((mm->has_swap_token)==1); } static inline void put_swap_token(struct mm_struct *mm) { + spin_lock(&(mm->swap_token_lock)); if (has_swap_token(mm)) __put_swap_token(mm); + spin_unlock(&(mm->swap_token_lock)); } static inline 
void disable_swap_token(void) { - put_swap_token(swap_token_mm); +// put_swap_token(swap_token_mm); +// honor_swap_token = 0; } #else /* CONFIG_SWAP */ diff --git a/kernel/fork.c b/kernel/fork.c index 7ce2ebe..8aed6dd 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -567,8 +567,19 @@ struct mm_struct *dup_mm(struct task_struct *tsk) memcpy(mm, oldmm, sizeof(*mm)); /* Initializing for Swap token stuff */ - mm->token_priority = 0; +// mm->token_priority = 0; mm->last_interval = 0; + mm->last_logical_interval = 0; + mm->has_swap_token = 0; + mm->last_logical_faultstamp = 0; + mm->mm_maj_flts = 0; + mm->page_flt_rate = 0; + mm->creation_jiffies64 = get_jiffies_64(); + mm->last_faultstamp_jiffies64 = mm->creation_jiffies64; + mm->last_faultstamp_cycles = get_cycles(); + mm->last_interval_ns = 0; + + spin_lock_init(&(mm->swap_token_lock)); if (!mm_init(mm, tsk)) goto fail_nomem; diff --git a/kernel/sched.c b/kernel/sched.c index ad1962d..00731d4 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2283,8 +2283,8 @@ out_activate: out_running: trace_mark(kernel_sched_wakeup, - "pid %d state %ld ## rq %p task %p rq->curr %p", - p->pid, p->state, rq, p, rq->curr); + "pid %d state %ld ## rq %p task %p rq->curr %p rq_load %lu page_flt_rate %llu", + p->pid, p->state, rq, p, rq->curr, rq->load.weight, p->mm->page_flt_rate); check_preempt_curr(rq, p); p->state = TASK_RUNNING; @@ -2418,8 +2418,8 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) inc_nr_running(rq); } trace_mark(kernel_sched_wakeup_new, - "pid %d state %ld ## rq %p task %p rq->curr %p", - p->pid, p->state, rq, p, rq->curr); + "pid %d state %ld ## rq %p task %p rq->curr %p rq_load %lu page_flt_rate %llu", + p->pid, p->state, rq, p, rq->curr, rq->load.weight, p->mm->page_flt_rate); check_preempt_curr(rq, p); #ifdef CONFIG_SMP if (p->sched_class->task_wake_up) @@ -2594,9 +2594,9 @@ context_switch(struct rq *rq, struct task_struct *prev, prepare_task_switch(rq, prev, next); 
trace_mark(kernel_sched_schedule, "prev_pid %d next_pid %d prev_state %ld " - "## rq %p prev %p next %p", + "## rq %p prev %p next %p rq_load %lu page_flt_rate %llu", prev->pid, next->pid, prev->state, - rq, prev, next); + rq, prev, next, rq->load.weight, next->mm->page_flt_rate); mm = next->mm; oldmm = prev->active_mm; /* diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 50ec088..e187896 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -83,6 +83,9 @@ extern int compat_log; extern int maps_protect; extern int latencytop_enabled; extern int sysctl_nr_open_min, sysctl_nr_open_max; +/* Swap-token related */ +extern int vm_token_validity_period_ms; +extern int max_swap_token_frac; #ifdef CONFIG_RCU_TORTURE_TEST extern int rcutorture_runnable; #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */ @@ -855,6 +858,23 @@ static struct ctl_table kern_table[] = { }; static struct ctl_table vm_table[] = { + /* swap token memory management controls */ + { + .ctl_name = CTL_UNNUMBERED, + .procname = "swap_token_validity_ms", + .data = &vm_token_validity_period_ms, + .maxlen = sizeof(vm_token_validity_period_ms), + .mode = 0644, + .proc_handler = &proc_dointvec + }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "max_swap_token_holder_frac", + .data = &max_swap_token_frac, + .maxlen = sizeof(max_swap_token_frac), + .mode = 0644, + .proc_handler = &proc_dointvec + }, { .ctl_name = VM_OVERCOMMIT_MEMORY, .procname = "overcommit_memory", diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 8f3fb3d..2311486 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -948,7 +948,9 @@ tracing_sched_switch_trace(struct trace_array *tr, struct trace_array_cpu *data, struct task_struct *prev, struct task_struct *next, - unsigned long flags) + unsigned long flags, + unsigned long rq_load, + u64 page_flt_rate) { struct trace_entry *entry; unsigned long irq_flags; @@ -964,6 +966,8 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->ctx.next_pid = next->pid; 
entry->ctx.next_prio = next->prio; entry->ctx.next_state = next->state; + entry->ctx.rq_load = rq_load; + entry->ctx.page_flt_rate = page_flt_rate; __trace_stack(tr, data, flags, 5); __raw_spin_unlock(&data->lock); raw_local_irq_restore(irq_flags); @@ -974,7 +978,9 @@ tracing_sched_wakeup_trace(struct trace_array *tr, struct trace_array_cpu *data, struct task_struct *wakee, struct task_struct *curr, - unsigned long flags) + unsigned long flags, + unsigned long rq_load, + u64 page_flt_rate) { struct trace_entry *entry; unsigned long irq_flags; @@ -990,6 +996,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->ctx.next_pid = wakee->pid; entry->ctx.next_prio = wakee->prio; entry->ctx.next_state = wakee->state; + entry->ctx.rq_load = rq_load; + entry->ctx.page_flt_rate = page_flt_rate; __trace_stack(tr, data, flags, 6); __raw_spin_unlock(&data->lock); raw_local_irq_restore(irq_flags); @@ -1524,13 +1532,16 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu) state = entry->ctx.prev_state ? __ffs(entry->ctx.prev_state) + 1 : 0; S = state < sizeof(state_to_char) - 1 ? state_to_char[state] : 'X'; comm = trace_find_cmdline(entry->ctx.next_pid); - trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c %s\n", + trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c %lu %llu %s\n", entry->ctx.prev_pid, entry->ctx.prev_prio, S, entry->type == TRACE_CTX ? "==>" : " +", entry->ctx.next_pid, entry->ctx.next_prio, - T, comm); + T, + entry->ctx.rq_load, + entry->ctx.page_flt_rate, + comm); break; case TRACE_SPECIAL: trace_seq_printf(s, "# %ld %ld %ld\n", @@ -1611,14 +1622,16 @@ static int print_trace_fmt(struct trace_iterator *iter) state_to_char[entry->ctx.prev_state] : 'X'; T = entry->ctx.next_state < sizeof(state_to_char) ? 
state_to_char[entry->ctx.next_state] : 'X'; - ret = trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c\n", + ret = trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c %lu %llu\n", entry->ctx.prev_pid, entry->ctx.prev_prio, S, entry->type == TRACE_CTX ? "==>" : " +", entry->ctx.next_pid, entry->ctx.next_prio, - T); + T, + entry->ctx.rq_load, + entry->ctx.page_flt_rate); if (!ret) return 0; break; @@ -1679,13 +1692,15 @@ static int print_raw_fmt(struct trace_iterator *iter) state_to_char[entry->ctx.next_state] : 'X'; if (entry->type == TRACE_WAKE) S = '+'; - ret = trace_seq_printf(s, "%d %d %c %d %d %c\n", + ret = trace_seq_printf(s, "%d %d %c %d %d %c %lu %llu\n", entry->ctx.prev_pid, entry->ctx.prev_prio, S, entry->ctx.next_pid, entry->ctx.next_prio, - T); + T, + entry->ctx.rq_load, + entry->ctx.page_flt_rate); if (!ret) return 0; break; @@ -1783,6 +1798,8 @@ static int print_bin_fmt(struct trace_iterator *iter) SEQ_PUT_FIELD_RET(s, entry->ctx.next_pid); SEQ_PUT_FIELD_RET(s, entry->ctx.next_prio); SEQ_PUT_FIELD_RET(s, entry->ctx.next_state); + SEQ_PUT_FIELD_RET(s, entry->ctx.rq_load); + SEQ_PUT_FIELD_RET(s, entry->ctx.page_flt_rate); break; case TRACE_SPECIAL: case TRACE_STACK: diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f69f867..aafa40e 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -39,6 +39,8 @@ struct ctx_switch_entry { unsigned int next_pid; unsigned char next_prio; unsigned char next_state; + unsigned long rq_load; + u64 page_flt_rate; }; /* @@ -204,14 +206,18 @@ void tracing_sched_switch_trace(struct trace_array *tr, struct trace_array_cpu *data, struct task_struct *prev, struct task_struct *next, - unsigned long flags); + unsigned long flags, + unsigned long rq_load, + u64 page_flt_rate); void tracing_record_cmdline(struct task_struct *tsk); void tracing_sched_wakeup_trace(struct trace_array *tr, struct trace_array_cpu *data, struct task_struct *wakee, struct task_struct *cur, - unsigned long flags); + unsigned long flags, + 
unsigned long rq_load, + u64 page_flt_rate); void trace_special(struct trace_array *tr, struct trace_array_cpu *data, unsigned long arg1, diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index cb817a2..93b0412 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -20,7 +20,7 @@ static atomic_t sched_ref; static void sched_switch_func(void *private, void *__rq, struct task_struct *prev, - struct task_struct *next) + struct task_struct *next, unsigned long rq_load, u64 page_flt_rate) { struct trace_array **ptr = private; struct trace_array *tr = *ptr; @@ -40,8 +40,10 @@ sched_switch_func(void *private, void *__rq, struct task_struct *prev, data = tr->data[cpu]; disabled = atomic_inc_return(&data->disabled); +// rq_load = ((struct rq*)__rq)->load.weight; + if (likely(disabled == 1)) - tracing_sched_switch_trace(tr, data, prev, next, flags); + tracing_sched_switch_trace(tr, data, prev, next, flags, rq_load, page_flt_rate); atomic_dec(&data->disabled); local_irq_restore(flags); @@ -54,6 +56,8 @@ sched_switch_callback(void *probe_data, void *call_data, struct task_struct *prev; struct task_struct *next; struct rq *__rq; + unsigned long rq_load; + u64 page_flt_rate; if (!atomic_read(&sched_ref)) return; @@ -65,17 +69,19 @@ sched_switch_callback(void *probe_data, void *call_data, __rq = va_arg(*args, typeof(__rq)); prev = va_arg(*args, typeof(prev)); next = va_arg(*args, typeof(next)); + rq_load = va_arg(*args, typeof(rq_load)); + page_flt_rate = va_arg(*args, typeof(page_flt_rate)); /* * If tracer_switch_func only points to the local * switch func, it still needs the ptr passed to it. 
*/ - sched_switch_func(probe_data, __rq, prev, next); + sched_switch_func(probe_data, __rq, prev, next, rq_load, page_flt_rate); } static void wakeup_func(void *private, void *__rq, struct task_struct *wakee, struct - task_struct *curr) + task_struct *curr, unsigned long rq_load, u64 page_flt_rate) { struct trace_array **ptr = private; struct trace_array *tr = *ptr; @@ -94,8 +100,10 @@ wakeup_func(void *private, void *__rq, struct task_struct *wakee, struct data = tr->data[cpu]; disabled = atomic_inc_return(&data->disabled); +// rq_load = ((struct rq*)__rq)->load.weight; + if (likely(disabled == 1)) - tracing_sched_wakeup_trace(tr, data, wakee, curr, flags); + tracing_sched_wakeup_trace(tr, data, wakee, curr, flags, rq_load, page_flt_rate); atomic_dec(&data->disabled); local_irq_restore(flags); @@ -108,6 +116,8 @@ wake_up_callback(void *probe_data, void *call_data, struct task_struct *curr; struct task_struct *task; struct rq *__rq; + unsigned long rq_load; + u64 page_flt_rate; if (likely(!tracer_enabled)) return; @@ -119,11 +129,13 @@ wake_up_callback(void *probe_data, void *call_data, __rq = va_arg(*args, typeof(__rq)); task = va_arg(*args, typeof(task)); curr = va_arg(*args, typeof(curr)); + rq_load = va_arg(*args, typeof(rq_load)); + page_flt_rate = va_arg(*args, typeof(page_flt_rate)); tracing_record_cmdline(task); tracing_record_cmdline(curr); - wakeup_func(probe_data, __rq, task, curr); + wakeup_func(probe_data, __rq, task, curr, rq_load, page_flt_rate); } static void sched_switch_reset(struct trace_array *tr) @@ -141,7 +153,7 @@ static int tracing_sched_register(void) int ret; ret = marker_probe_register("kernel_sched_wakeup", - "pid %d state %ld ## rq %p task %p rq->curr %p", + "pid %d state %ld ## rq %p task %p rq->curr %p rq_load %lu page_flt_rate %llu", wake_up_callback, &ctx_trace); if (ret) { @@ -151,7 +163,7 @@ static int tracing_sched_register(void) } ret = marker_probe_register("kernel_sched_wakeup_new", - "pid %d state %ld ## rq %p task %p 
rq->curr %p", + "pid %d state %ld ## rq %p task %p rq->curr %p rq_load %lu page_flt_rate %llu", wake_up_callback, &ctx_trace); if (ret) { @@ -162,7 +174,7 @@ static int tracing_sched_register(void) ret = marker_probe_register("kernel_sched_schedule", "prev_pid %d next_pid %d prev_state %ld " - "## rq %p prev %p next %p", + "## rq %p prev %p next %p rq_load %lu page_flt_rate %llu", sched_switch_callback, &ctx_trace); if (ret) { diff --git a/mm/memory.c b/mm/memory.c index 1002f47..ccc6eee 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2268,7 +2268,9 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, delayacct_set_flag(DELAYACCT_PF_SWAPIN); page = lookup_swap_cache(entry); if (!page) { - grab_swap_token(); /* Contend for token _before_ read-in */ + /* Contend for token _before_ read-in */ + if(honor_swap_token) + grab_swap_token(mm); page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vma, address); if (!page) { diff --git a/mm/rmap.c b/mm/rmap.c index e8d639b..8fb0ed5 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -317,11 +317,16 @@ static int page_referenced_one(struct page *page, } else if (ptep_clear_flush_young_notify(vma, address, pte)) referenced++; - /* Pretend the page is referenced if the task has the - swap token and is in the middle of a page fault. */ - if (mm != current->mm && has_swap_token(mm) && - rwsem_is_locked(&mm->mmap_sem)) - referenced++; + /* If the task has an invalid swap token, revoke it. 
+ Otherwise, pretend the page is referenced */ + spin_lock(&(mm->swap_token_lock)); + if ((mm != current->mm) && has_swap_token(mm)) { + if(time_after64(get_jiffies_64(), mm->token_expiry_jiffies64)) + __put_swap_token(mm); + else + referenced++; + } + spin_unlock(&(mm->swap_token_lock)); (*mapcount)--; pte_unmap_unlock(pte, ptl); diff --git a/mm/thrash.c b/mm/thrash.c index c4c5205..222ce54 100644 --- a/mm/thrash.c +++ b/mm/thrash.c @@ -21,59 +21,244 @@ #include <linux/mm.h> #include <linux/sched.h> #include <linux/swap.h> +#include <linux/rbtree.h> +#include <linux/timex.h> +#include <linux/cpufreq.h> +#include <linux/clocksource.h> static DEFINE_SPINLOCK(swap_token_lock); struct mm_struct *swap_token_mm; static unsigned int global_faults; -void grab_swap_token(void) +int honor_swap_token = 1; + +struct rb_root token_tree_root = RB_ROOT; +struct rb_node *token_tree_leftmost = NULL; +int num_swap_token_holders = 0; + +/* maximum allowed fraction of swap-token holders */ +int max_swap_token_frac = 50; +/* number of msec for which the token is valid */ +int vm_token_validity_period_ms = 10; + + +void token_tree_insert(struct mm_struct *mm) { + + struct rb_node **new = &(token_tree_root.rb_node), *parent = NULL; + int leftmost = 1; + + while(*new) { + struct mm_struct *this = container_of(*new, struct mm_struct, token_tree_node); + + parent = *new; + if(mm->page_flt_rate < this->page_flt_rate) + new = &((*new)->rb_left); +// else if(!((mm->page_flt_rate == this->page_flt_rate) && (mm == this))) { + else if(mm != this) { + new = &((*new)->rb_right); + leftmost = 0; + } + else + return; + } + + if(leftmost) + token_tree_leftmost = &(mm->token_tree_node); + + rb_link_node(&(mm->token_tree_node), parent, new); + rb_insert_color(&(mm->token_tree_node), &token_tree_root); +} + +void token_tree_delete(struct mm_struct* mm) { + + if(&(mm->token_tree_node) == token_tree_leftmost) { + struct rb_node* next_node = rb_next(token_tree_leftmost); + token_tree_leftmost = next_node; 
+ } + rb_erase(&(mm->token_tree_node), &token_tree_root); +} +/* 64-bit division [copied from x86/include/div64.h] + * + * modifies the dividend to store the quotient */ +#define my_do_div64(n, base) \ + ({ \ + unsigned long __upper, __low, __high, __mod, __base; \ + __base = (base); \ + asm("":"=a" (__low), "=d" (__high) : "A" (n)); \ + __upper = __high; \ + if (__high) { \ + __upper = __high % (__base); \ + __high = __high / (__base); \ + } \ + asm("divl %2":"=a" (__low), "=d" (__mod) \ + : "rm" (__base), "0" (__low), "1" (__upper)); \ + asm("":"=A" (n) : "a" (__low), "d" (__high)); \ + __mod; \ + }) + +#define sg_jiffies_2_ns(x) (x * 1000000LLU * (1000/HZ)) + +/*TODO: to add clock_t get_cycles(void) for timing measurements when time elapsed + * between successive page faults is < 1 jiffies + * + * use 'unsigned int cpufreq_quick_get(unsigned int cpu)' for getting the cpu freq + * in KHz unit + * + * conversion from number of clocks to nanosecond: + * + * x clocks = ((x * 10^6)/cpufreq) ns + * + */ + +void grab_swap_token(struct mm_struct *cur_mm) { - int current_interval; +// struct mm_struct *cur_mm; + struct mm_struct *leftmost_mm = NULL; + int current_logical_interval; + u64 current_interval; + unsigned long long current_interval_ns = 0; + int cur_swap_token_frac; + + int nr_total; +// u64 duration; + u64 a,b,c,d,e,f; + u64 prev_page_flt_rate; + +// cur_mm = current->mm; + + // global lock + spin_lock(&(swap_token_lock)); + global_faults++; + current_logical_interval = global_faults - cur_mm->last_logical_faultstamp; + + spin_lock(&(cur_mm->swap_token_lock)); + + cur_mm->mm_maj_flts++; + /* calculate the page fault rate */ + + current_interval = (get_jiffies_64() - cur_mm->last_faultstamp_jiffies64); + if(current_interval == 0LLU) { /* interval less than 1 jiffie */ + current_interval_ns = cyc2ns(clock, (long)get_cycles() - (long)cur_mm->last_faultstamp_cycles); + } + + cur_mm->last_faultstamp_jiffies64 = get_jiffies_64(); + cur_mm->last_faultstamp_cycles = 
get_cycles(); + + prev_page_flt_rate = cur_mm->page_flt_rate; + +#define MULTIPLIER 1000000LLU + if(cur_mm->last_interval == 0) { + cur_mm->page_flt_rate = MULTIPLIER; + if(current_interval) { + a = sg_jiffies_2_ns(current_interval); + my_do_div64(cur_mm->page_flt_rate, a); + } + else + my_do_div64(cur_mm->page_flt_rate, current_interval_ns); + + } + else { + a = 25LLU*cur_mm->page_flt_rate; + b = 100LLU; + c = 35*MULTIPLIER; - current_interval = global_faults - current->mm->faultstamp; + if(cur_mm->last_interval) + d = 100LLU * sg_jiffies_2_ns(cur_mm->last_interval); + else + d = 100LLU * cur_mm->last_interval_ns; - if (!spin_trylock(&swap_token_lock)) - return; + e = 40*MULTIPLIER; - /* First come first served */ - if (swap_token_mm == NULL) { - current->mm->token_priority = current->mm->token_priority + 2; - swap_token_mm = current->mm; - goto out; + if(current_interval) + f = 100LLU * sg_jiffies_2_ns(current_interval); + else + f = 100LLU * current_interval_ns; + + printk(KERN_NOTICE "SGDEBUG: f becomes %llu\n", f); + my_do_div64(a,b); + my_do_div64(c,d); + my_do_div64(e,f); + cur_mm->page_flt_rate = (a+c+e); } +#undef MULTIPLIER - if (current->mm != swap_token_mm) { - if (current_interval < current->mm->last_interval) - current->mm->token_priority++; + + printk(KERN_NOTICE "SGDEBUG: [pid:%u] # page-faults: %u; current_interval %llu; pg-flt rate: %llu \n", current->pid, cur_mm->mm_maj_flts, current_interval, cur_mm->page_flt_rate); + + /* calculate the current fraction of token_holders */ + nr_total = nr_running(); + cur_swap_token_frac = (num_swap_token_holders *100) / nr_total; + + printk(KERN_NOTICE "SGDEBUG: [pid:%u] Total proc: %d ; # Holders: %d ; Holder-frac: %d \n", current->pid, nr_total, num_swap_token_holders, cur_swap_token_frac); + + /* the leftmost node in the rb-tree, with the least page-fault rate */ + if( token_tree_leftmost != NULL) + leftmost_mm = container_of(token_tree_leftmost, struct mm_struct, token_tree_node); + + /* check if 'eligible' 
to get the token */ + if( (prev_page_flt_rate < cur_mm->page_flt_rate) && + (((token_tree_leftmost != NULL) && (cur_mm->page_flt_rate > leftmost_mm->page_flt_rate)) || + (cur_swap_token_frac < max_swap_token_frac)) ) { + + printk(KERN_NOTICE "SGDEBUG: [pid:%u] Eligible for token\n", current->pid); + if(cur_mm->has_swap_token) { + /* update the position in the tree */ + token_tree_delete(cur_mm); + token_tree_insert(cur_mm); + } else { - if (likely(current->mm->token_priority > 0)) - current->mm->token_priority--; + /* give the token */ + cur_mm->has_swap_token = 1; + token_tree_insert(cur_mm); + num_swap_token_holders++; } - /* Check if we deserve the token */ - if (current->mm->token_priority > - swap_token_mm->token_priority) { - current->mm->token_priority += 2; - swap_token_mm = current->mm; + /* set the expiry time of the token */ + cur_mm->token_expiry_jiffies64 = get_jiffies_64() + + (HZ * vm_token_validity_period_ms) / 1000; + + /* re-calculate the fraction of token holders */ + cur_swap_token_frac = (num_swap_token_holders *100) / nr_running(); + + /* the leftmost_token_tree may get updated, so recompute leftmost_mm */ + leftmost_mm = container_of(token_tree_leftmost, struct mm_struct, token_tree_node); + + /* if max allowed fraction exceeded take the token from the leftmost */ + if(cur_swap_token_frac > max_swap_token_frac) { +// printk(KERN_NOTICE "SGDEBUG: (grab_swap_token) mm = %x\n", cur_mm); + __put_swap_token(leftmost_mm); } - } else { - /* Token holder came in again! 
*/ - current->mm->token_priority += 2; } + /* if not eligible for page-faults and token has expired*/ + else if(cur_mm->has_swap_token && + time_after64(get_jiffies_64(), cur_mm->token_expiry_jiffies64) ) { +// printk(KERN_NOTICE "SGDEBUG: (grab_swap_token) mm = %x\n", cur_mm); + __put_swap_token(cur_mm); + } + cur_mm->last_logical_faultstamp = global_faults; + cur_mm->last_logical_interval = current_logical_interval; + cur_mm->last_interval_ns = current_interval_ns; + cur_mm->last_interval = current_interval; + spin_unlock(&(cur_mm->swap_token_lock)); + spin_unlock(&(swap_token_lock)); -out: - current->mm->faultstamp = global_faults; - current->mm->last_interval = current_interval; - spin_unlock(&swap_token_lock); -return; + return; } -/* Called on process exit. */ +/* Revokes the swap token */ + +/* Expects that the mm->swap_token_lock is held before it is called */ void __put_swap_token(struct mm_struct *mm) { - spin_lock(&swap_token_lock); - if (likely(mm == swap_token_mm)) - swap_token_mm = NULL; - spin_unlock(&swap_token_lock); +// printk(KERN_NOTICE "SGDEBUG: (put_swap_token) mm = %x\n", mm); +// spin_lock(&(mm->swap_token_lock)); + if (likely(mm->has_swap_token)) { + mm->has_swap_token = 0; + token_tree_delete(mm); + num_swap_token_holders--; + printk(KERN_NOTICE "SGDEBUG: Token revoked\n"); + } +// spin_unlock(&(mm->swap_token_lock)); } +