Re: task->mm remains null in 'context_switch()'

Sukanto Ghosh <sukanto.cse.iitb@xxxxxxxxx> · Mon, 11 May 2009 23:03:44 +0530

I have attached the diff


On Mon, May 11, 2009 at 8:16 PM, Mulyadi Santosa
<mulyadi.santosa@xxxxxxxxx> wrote:
> Hi...
>
> On Mon, May 11, 2009 at 4:22 PM, Sukanto Ghosh
> <sukanto.cse.iitb@xxxxxxxxx> wrote:
>> Hi,
>>
>> I was adding a field to the 3 trace_mark() calls in kernel/sched.c.
>> The calls are at: i) context_switch(prev, next, ... )  ii)
>> try_to_wake_up(p, ...)  iii) wake_up_new_task(p, ...)  functions
>>
>> The field is 'task'->mm->pfrate. (I have added the pfrate field in
>> mm_struct), where 'task' is a placeholder for prev/next/p/rq->curr in
>> those trace_mark() calls.  I found that always either the next/p
>> pointers are NULL or task->mm is NULL at that particular point.  Is it
>> supposed to be so ? Why ?
>>
>> PS: The trace_mark() in kernel/sched.c calls are for adding entries to
>> the trace-file of sched_switch tracer.
>
> Always? hmm strange. sure you did the deferencing to task_struct correctly?
>
> perhaps you could share to us the change you made? in a form of diff -u perhaps?
>
> regards,
>
> Mulyadi.
>



-- 
Regards,
Sukanto Ghosh
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -224,10 +225,25 @@ struct mm_struct {
 	 * it has been since this task got the token.
 	 * Look at mm/thrash.c
 	 */
-	unsigned int faultstamp;
-	unsigned int token_priority;
-	unsigned int last_interval;
+	u64 last_logical_faultstamp;    /* global faultstamp value at last major fault */
+	u64 last_interval;		/* in jiffies */
+	unsigned long long last_interval_ns;		/* in nanoseconds (for intervals < 1 jiffie) */
+	u64 last_logical_interval;	/* in global faults count */
+	unsigned int mm_maj_flts;
+	u64 page_flt_rate;		/* unit: (ns)^(-1)  */
+	u64 last_faultstamp_jiffies64;	/* jiffies of last major fault */
+	cycles_t last_faultstamp_cycles;	/* clock_value of last major fault */
+	u64 token_expiry_jiffies64;	/* expiry time of the swap_token */
+	u64 creation_jiffies64;		/* time at which the mm_struct was created */
+	
+	
+	/* boolean value: whether this process has a swap token or not */
+	unsigned int has_swap_token;		
+	
+	spinlock_t swap_token_lock;
+	struct rb_node token_tree_node;
 
+	unsigned int token_priority;
 	unsigned long flags; /* Must use atomic bitops to access the bits */
 
 	struct core_state *core_state; /* coredumping support */

diff --git a/include/linux/swap.h b/include/linux/swap.h
index de40f16..d742a0e 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -254,24 +254,28 @@ extern int remove_exclusive_swap_page(struct page *);
 struct backing_dev_info;
 
 /* linux/mm/thrash.c */
+extern int honor_swap_token;
 extern struct mm_struct * swap_token_mm;
-extern void grab_swap_token(void);
+extern void grab_swap_token(struct mm_struct *);
 extern void __put_swap_token(struct mm_struct *);
 
 static inline int has_swap_token(struct mm_struct *mm)
 {
-	return (mm == swap_token_mm);
+	return ((mm->has_swap_token)==1);
 }
 
 static inline void put_swap_token(struct mm_struct *mm)
 {
+	spin_lock(&(mm->swap_token_lock));
 	if (has_swap_token(mm))
 		__put_swap_token(mm);
+	spin_unlock(&(mm->swap_token_lock));
 }
 
 static inline void disable_swap_token(void)
 {
-	put_swap_token(swap_token_mm);
+//	put_swap_token(swap_token_mm);
+//	honor_swap_token = 0;	
 }
 
 #else /* CONFIG_SWAP */
diff --git a/kernel/fork.c b/kernel/fork.c
index 7ce2ebe..8aed6dd 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -567,8 +567,19 @@ struct mm_struct *dup_mm(struct task_struct *tsk)
 	memcpy(mm, oldmm, sizeof(*mm));
 
 	/* Initializing for Swap token stuff */
-	mm->token_priority = 0;
+//	mm->token_priority = 0;
 	mm->last_interval = 0;
+	mm->last_logical_interval = 0;
+	mm->has_swap_token = 0;
+	mm->last_logical_faultstamp = 0;
+	mm->mm_maj_flts = 0;
+	mm->page_flt_rate = 0;
+	mm->creation_jiffies64 = get_jiffies_64();
+	mm->last_faultstamp_jiffies64 = mm->creation_jiffies64;
+	mm->last_faultstamp_cycles = get_cycles();
+	mm->last_interval_ns = 0;
+
+	spin_lock_init(&(mm->swap_token_lock));
 
 	if (!mm_init(mm, tsk))
 		goto fail_nomem;
diff --git a/kernel/sched.c b/kernel/sched.c
index ad1962d..00731d4 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2283,8 +2283,8 @@ out_activate:
 
 out_running:
 	trace_mark(kernel_sched_wakeup,
-		"pid %d state %ld ## rq %p task %p rq->curr %p",
-		p->pid, p->state, rq, p, rq->curr);
+		"pid %d state %ld ## rq %p task %p rq->curr %p rq_load %lu page_flt_rate %llu",
+		p->pid, p->state, rq, p, rq->curr, rq->load.weight, p->mm->page_flt_rate);
 	check_preempt_curr(rq, p);
 
 	p->state = TASK_RUNNING;
@@ -2418,8 +2418,8 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 		inc_nr_running(rq);
 	}
 	trace_mark(kernel_sched_wakeup_new,
-		"pid %d state %ld ## rq %p task %p rq->curr %p",
-		p->pid, p->state, rq, p, rq->curr);
+		"pid %d state %ld ## rq %p task %p rq->curr %p rq_load %lu page_flt_rate %llu",
+		p->pid, p->state, rq, p, rq->curr, rq->load.weight, p->mm->page_flt_rate);
 	check_preempt_curr(rq, p);
 #ifdef CONFIG_SMP
 	if (p->sched_class->task_wake_up)
@@ -2594,9 +2594,9 @@ context_switch(struct rq *rq, struct task_struct *prev,
 	prepare_task_switch(rq, prev, next);
 	trace_mark(kernel_sched_schedule,
 		"prev_pid %d next_pid %d prev_state %ld "
-		"## rq %p prev %p next %p",
+		"## rq %p prev %p next %p rq_load %lu page_flt_rate %llu",
 		prev->pid, next->pid, prev->state,
-		rq, prev, next);
+		rq, prev, next, rq->load.weight, next->mm->page_flt_rate);
 	mm = next->mm;
 	oldmm = prev->active_mm;
 	/*
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 50ec088..e187896 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -83,6 +83,9 @@ extern int compat_log;
 extern int maps_protect;
 extern int latencytop_enabled;
 extern int sysctl_nr_open_min, sysctl_nr_open_max;
+/* Swap-token related */
+extern int vm_token_validity_period_ms; 
+extern int max_swap_token_frac;
 #ifdef CONFIG_RCU_TORTURE_TEST
 extern int rcutorture_runnable;
 #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
@@ -855,6 +858,23 @@ static struct ctl_table kern_table[] = {
 };
 
 static struct ctl_table vm_table[] = {
+	/* swap token memory management controls */
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "swap_token_validity_ms",
+		.data		= &vm_token_validity_period_ms,
+		.maxlen		= sizeof(vm_token_validity_period_ms),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "max_swap_token_holder_frac",
+		.data		= &max_swap_token_frac,
+		.maxlen		= sizeof(max_swap_token_frac),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec
+	},
 	{
 		.ctl_name	= VM_OVERCOMMIT_MEMORY,
 		.procname	= "overcommit_memory",
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8f3fb3d..2311486 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -948,7 +948,9 @@ tracing_sched_switch_trace(struct trace_array *tr,
 			   struct trace_array_cpu *data,
 			   struct task_struct *prev,
 			   struct task_struct *next,
-			   unsigned long flags)
+			   unsigned long flags,
+			   unsigned long rq_load, 
+			   u64 page_flt_rate)
 {
 	struct trace_entry *entry;
 	unsigned long irq_flags;
@@ -964,6 +966,8 @@ tracing_sched_switch_trace(struct trace_array *tr,
 	entry->ctx.next_pid	= next->pid;
 	entry->ctx.next_prio	= next->prio;
 	entry->ctx.next_state	= next->state;
+	entry->ctx.rq_load	= rq_load;
+	entry->ctx.page_flt_rate = page_flt_rate;
 	__trace_stack(tr, data, flags, 5);
 	__raw_spin_unlock(&data->lock);
 	raw_local_irq_restore(irq_flags);
@@ -974,7 +978,9 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 			   struct trace_array_cpu *data,
 			   struct task_struct *wakee,
 			   struct task_struct *curr,
-			   unsigned long flags)
+			   unsigned long flags,
+			   unsigned long rq_load,
+			   u64 page_flt_rate)
 {
 	struct trace_entry *entry;
 	unsigned long irq_flags;
@@ -990,6 +996,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 	entry->ctx.next_pid	= wakee->pid;
 	entry->ctx.next_prio	= wakee->prio;
 	entry->ctx.next_state	= wakee->state;
+	entry->ctx.rq_load	= rq_load;
+	entry->ctx.page_flt_rate = page_flt_rate;
 	__trace_stack(tr, data, flags, 6);
 	__raw_spin_unlock(&data->lock);
 	raw_local_irq_restore(irq_flags);
@@ -1524,13 +1532,16 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
 		state = entry->ctx.prev_state ? __ffs(entry->ctx.prev_state) + 1 : 0;
 		S = state < sizeof(state_to_char) - 1 ? state_to_char[state] : 'X';
 		comm = trace_find_cmdline(entry->ctx.next_pid);
-		trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c %s\n",
+		trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c %lu %llu %s\n",
 				 entry->ctx.prev_pid,
 				 entry->ctx.prev_prio,
 				 S, entry->type == TRACE_CTX ? "==>" : "  +",
 				 entry->ctx.next_pid,
 				 entry->ctx.next_prio,
-				 T, comm);
+				 T,
+				 entry->ctx.rq_load,
+				 entry->ctx.page_flt_rate,
+				 comm);
 		break;
 	case TRACE_SPECIAL:
 		trace_seq_printf(s, "# %ld %ld %ld\n",
@@ -1611,14 +1622,16 @@ static int print_trace_fmt(struct trace_iterator *iter)
 			state_to_char[entry->ctx.prev_state] : 'X';
 		T = entry->ctx.next_state < sizeof(state_to_char) ?
 			state_to_char[entry->ctx.next_state] : 'X';
-		ret = trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c\n",
+		ret = trace_seq_printf(s, " %5d:%3d:%c %s %5d:%3d:%c %lu %llu\n",
 				       entry->ctx.prev_pid,
 				       entry->ctx.prev_prio,
 				       S,
 				       entry->type == TRACE_CTX ? "==>" : "  +",
 				       entry->ctx.next_pid,
 				       entry->ctx.next_prio,
-				       T);
+				       T,
+				       entry->ctx.rq_load,
+				       entry->ctx.page_flt_rate);
 		if (!ret)
 			return 0;
 		break;
@@ -1679,13 +1692,15 @@ static int print_raw_fmt(struct trace_iterator *iter)
 			state_to_char[entry->ctx.next_state] : 'X';
 		if (entry->type == TRACE_WAKE)
 			S = '+';
-		ret = trace_seq_printf(s, "%d %d %c %d %d %c\n",
+		ret = trace_seq_printf(s, "%d %d %c %d %d %c %lu %llu\n",
 				       entry->ctx.prev_pid,
 				       entry->ctx.prev_prio,
 				       S,
 				       entry->ctx.next_pid,
 				       entry->ctx.next_prio,
-				       T);
+				       T,
+				       entry->ctx.rq_load,
+				       entry->ctx.page_flt_rate);
 		if (!ret)
 			return 0;
 		break;
@@ -1783,6 +1798,8 @@ static int print_bin_fmt(struct trace_iterator *iter)
 		SEQ_PUT_FIELD_RET(s, entry->ctx.next_pid);
 		SEQ_PUT_FIELD_RET(s, entry->ctx.next_prio);
 		SEQ_PUT_FIELD_RET(s, entry->ctx.next_state);
+		SEQ_PUT_FIELD_RET(s, entry->ctx.rq_load);
+		SEQ_PUT_FIELD_RET(s, entry->ctx.page_flt_rate);
 		break;
 	case TRACE_SPECIAL:
 	case TRACE_STACK:
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f69f867..aafa40e 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -39,6 +39,8 @@ struct ctx_switch_entry {
 	unsigned int		next_pid;
 	unsigned char		next_prio;
 	unsigned char		next_state;
+	unsigned long		rq_load;
+	u64			page_flt_rate;
 };
 
 /*
@@ -204,14 +206,18 @@ void tracing_sched_switch_trace(struct trace_array *tr,
 				struct trace_array_cpu *data,
 				struct task_struct *prev,
 				struct task_struct *next,
-				unsigned long flags);
+				unsigned long flags,
+				unsigned long rq_load,
+				u64 page_flt_rate);
 void tracing_record_cmdline(struct task_struct *tsk);
 
 void tracing_sched_wakeup_trace(struct trace_array *tr,
 				struct trace_array_cpu *data,
 				struct task_struct *wakee,
 				struct task_struct *cur,
-				unsigned long flags);
+				unsigned long flags,
+				unsigned long rq_load,
+				u64 page_flt_rate);
 void trace_special(struct trace_array *tr,
 		   struct trace_array_cpu *data,
 		   unsigned long arg1,
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index cb817a2..93b0412 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -20,7 +20,7 @@ static atomic_t			sched_ref;
 
 static void
 sched_switch_func(void *private, void *__rq, struct task_struct *prev,
-			struct task_struct *next)
+			struct task_struct *next, unsigned long rq_load, u64 page_flt_rate)
 {
 	struct trace_array **ptr = private;
 	struct trace_array *tr = *ptr;
@@ -40,8 +40,10 @@ sched_switch_func(void *private, void *__rq, struct task_struct *prev,
 	data = tr->data[cpu];
 	disabled = atomic_inc_return(&data->disabled);
 
+//	rq_load = ((struct rq*)__rq)->load.weight;
+
 	if (likely(disabled == 1))
-		tracing_sched_switch_trace(tr, data, prev, next, flags);
+		tracing_sched_switch_trace(tr, data, prev, next, flags, rq_load, page_flt_rate);
 
 	atomic_dec(&data->disabled);
 	local_irq_restore(flags);
@@ -54,6 +56,8 @@ sched_switch_callback(void *probe_data, void *call_data,
 	struct task_struct *prev;
 	struct task_struct *next;
 	struct rq *__rq;
+	unsigned long rq_load;
+	u64 page_flt_rate;
 
 	if (!atomic_read(&sched_ref))
 		return;
@@ -65,17 +69,19 @@ sched_switch_callback(void *probe_data, void *call_data,
 	__rq = va_arg(*args, typeof(__rq));
 	prev = va_arg(*args, typeof(prev));
 	next = va_arg(*args, typeof(next));
+	rq_load = va_arg(*args, typeof(rq_load));
+	page_flt_rate = va_arg(*args, typeof(page_flt_rate));
 
 	/*
 	 * If tracer_switch_func only points to the local
 	 * switch func, it still needs the ptr passed to it.
 	 */
-	sched_switch_func(probe_data, __rq, prev, next);
+	sched_switch_func(probe_data, __rq, prev, next, rq_load, page_flt_rate);
 }
 
 static void
 wakeup_func(void *private, void *__rq, struct task_struct *wakee, struct
-			task_struct *curr)
+			task_struct *curr, unsigned long rq_load, u64 page_flt_rate)
 {
 	struct trace_array **ptr = private;
 	struct trace_array *tr = *ptr;
@@ -94,8 +100,10 @@ wakeup_func(void *private, void *__rq, struct task_struct *wakee, struct
 	data = tr->data[cpu];
 	disabled = atomic_inc_return(&data->disabled);
 
+//	rq_load = ((struct rq*)__rq)->load.weight;
+
 	if (likely(disabled == 1))
-		tracing_sched_wakeup_trace(tr, data, wakee, curr, flags);
+		tracing_sched_wakeup_trace(tr, data, wakee, curr, flags, rq_load, page_flt_rate);
 
 	atomic_dec(&data->disabled);
 	local_irq_restore(flags);
@@ -108,6 +116,8 @@ wake_up_callback(void *probe_data, void *call_data,
 	struct task_struct *curr;
 	struct task_struct *task;
 	struct rq *__rq;
+	unsigned long rq_load;
+	u64 page_flt_rate;
 
 	if (likely(!tracer_enabled))
 		return;
@@ -119,11 +129,13 @@ wake_up_callback(void *probe_data, void *call_data,
 	__rq = va_arg(*args, typeof(__rq));
 	task = va_arg(*args, typeof(task));
 	curr = va_arg(*args, typeof(curr));
+	rq_load = va_arg(*args, typeof(rq_load));
+	page_flt_rate = va_arg(*args, typeof(page_flt_rate));
 
 	tracing_record_cmdline(task);
 	tracing_record_cmdline(curr);
 
-	wakeup_func(probe_data, __rq, task, curr);
+	wakeup_func(probe_data, __rq, task, curr, rq_load, page_flt_rate);
 }
 
 static void sched_switch_reset(struct trace_array *tr)
@@ -141,7 +153,7 @@ static int tracing_sched_register(void)
 	int ret;
 
 	ret = marker_probe_register("kernel_sched_wakeup",
-			"pid %d state %ld ## rq %p task %p rq->curr %p",
+			"pid %d state %ld ## rq %p task %p rq->curr %p rq_load %lu page_flt_rate %llu",
 			wake_up_callback,
 			&ctx_trace);
 	if (ret) {
@@ -151,7 +163,7 @@ static int tracing_sched_register(void)
 	}
 
 	ret = marker_probe_register("kernel_sched_wakeup_new",
-			"pid %d state %ld ## rq %p task %p rq->curr %p",
+			"pid %d state %ld ## rq %p task %p rq->curr %p rq_load %lu page_flt_rate %llu",
 			wake_up_callback,
 			&ctx_trace);
 	if (ret) {
@@ -162,7 +174,7 @@ static int tracing_sched_register(void)
 
 	ret = marker_probe_register("kernel_sched_schedule",
 		"prev_pid %d next_pid %d prev_state %ld "
-		"## rq %p prev %p next %p",
+		"## rq %p prev %p next %p rq_load %lu page_flt_rate %llu",
 		sched_switch_callback,
 		&ctx_trace);
 	if (ret) {
diff --git a/mm/memory.c b/mm/memory.c
index 1002f47..ccc6eee 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2268,7 +2268,9 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	delayacct_set_flag(DELAYACCT_PF_SWAPIN);
 	page = lookup_swap_cache(entry);
 	if (!page) {
-		grab_swap_token(); /* Contend for token _before_ read-in */
+		/* Contend for token _before_ read-in */
+		if(honor_swap_token)
+			grab_swap_token(mm); 
 		page = swapin_readahead(entry,
 					GFP_HIGHUSER_MOVABLE, vma, address);
 		if (!page) {
diff --git a/mm/rmap.c b/mm/rmap.c
index e8d639b..8fb0ed5 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -317,11 +317,16 @@ static int page_referenced_one(struct page *page,
 	} else if (ptep_clear_flush_young_notify(vma, address, pte))
 		referenced++;
 
-	/* Pretend the page is referenced if the task has the
-	   swap token and is in the middle of a page fault. */
-	if (mm != current->mm && has_swap_token(mm) &&
-			rwsem_is_locked(&mm->mmap_sem))
-		referenced++;
+	/* If the task has an invalid swap token, revoke it. 
+	   Otherwise, pretend the page is referenced */
+	spin_lock(&(mm->swap_token_lock));
+	if ((mm != current->mm) && has_swap_token(mm)) {
+		if(time_after64(get_jiffies_64(), mm->token_expiry_jiffies64))
+			__put_swap_token(mm);
+		else
+			referenced++;
+	}
+	spin_unlock(&(mm->swap_token_lock));
 
 	(*mapcount)--;
 	pte_unmap_unlock(pte, ptl);
diff --git a/mm/thrash.c b/mm/thrash.c
index c4c5205..222ce54 100644
--- a/mm/thrash.c
+++ b/mm/thrash.c
@@ -21,59 +21,244 @@
 #include <linux/mm.h>
 #include <linux/sched.h>
 #include <linux/swap.h>
+#include <linux/rbtree.h>
+#include <linux/timex.h>
+#include <linux/cpufreq.h>
+#include <linux/clocksource.h>
 
 static DEFINE_SPINLOCK(swap_token_lock);
 struct mm_struct *swap_token_mm;
 static unsigned int global_faults;
 
-void grab_swap_token(void)
+int honor_swap_token = 1;
+
+struct rb_root token_tree_root = RB_ROOT;
+struct rb_node *token_tree_leftmost = NULL;
+int num_swap_token_holders = 0;
+
+/* maximum allowed fraction of swap-token holders */
+int max_swap_token_frac = 50;
+/* number of msec for which the token is valid */
+int vm_token_validity_period_ms = 10;
+
+
+void token_tree_insert(struct mm_struct *mm) {
+
+	struct rb_node **new = &(token_tree_root.rb_node), *parent = NULL;
+	int leftmost = 1;
+
+	while(*new) {
+		struct mm_struct *this = container_of(*new, struct mm_struct, token_tree_node);
+
+		parent = *new;
+		if(mm->page_flt_rate < this->page_flt_rate)
+			new = &((*new)->rb_left);
+//		else if(!((mm->page_flt_rate == this->page_flt_rate) && (mm == this))) {
+		else if(mm != this) {
+			new = &((*new)->rb_right);
+			leftmost = 0;
+		}
+		else
+			return;
+	}
+
+	if(leftmost)
+		token_tree_leftmost = &(mm->token_tree_node);
+
+	rb_link_node(&(mm->token_tree_node), parent, new);
+	rb_insert_color(&(mm->token_tree_node), &token_tree_root);
+}
+
+void token_tree_delete(struct mm_struct* mm) {
+
+	if(&(mm->token_tree_node) == token_tree_leftmost) {
+		struct rb_node* next_node = rb_next(token_tree_leftmost);
+		token_tree_leftmost = next_node;
+	}
+	rb_erase(&(mm->token_tree_node), &token_tree_root);
+}
+/* 64-bit division [copied from x86/include/div64.h]
+ *
+ * modifies the dividend to store the quotient */
+#define my_do_div64(n, base)                                         \
+	({                                                              \
+	         unsigned long __upper, __low, __high, __mod, __base;    \
+	         __base = (base);                                        \
+	         asm("":"=a" (__low), "=d" (__high) : "A" (n));          \
+	         __upper = __high;                                       \
+	         if (__high) {                                           \
+	                 __upper = __high % (__base);                    \
+	                 __high = __high / (__base);                     \
+	         }                                                       \
+	         asm("divl %2":"=a" (__low), "=d" (__mod)                \
+			             : "rm" (__base), "0" (__low), "1" (__upper));       \
+	         asm("":"=A" (n) : "a" (__low), "d" (__high));           \
+	         __mod;                                                  \
+	 })
+
+#define sg_jiffies_2_ns(x)  (x * 1000000LLU  * (1000/HZ))
+
+/*TODO: to add clock_t get_cycles(void)  for timing measurements when time elapsed 
+ * between successive page faults is < 1 jiffies		
+ *
+ * use 'unsigned int cpufreq_quick_get(unsigned int cpu)' for getting the cpu freq
+ * in KHz unit
+ *
+ * conversion from number of clocks to nanosecond:
+ * 
+ * x clocks  = ((x * 10^6)/cpufreq) ns
+ *
+ */
+
+void grab_swap_token(struct mm_struct *cur_mm)
 {
-	int current_interval;
+//	struct mm_struct *cur_mm;
+	struct mm_struct *leftmost_mm = NULL;
+	int current_logical_interval;
+	u64 current_interval;
+	unsigned long long current_interval_ns = 0;
+	int cur_swap_token_frac;
+
+	int nr_total;
+//	u64 duration;	
+	u64 a,b,c,d,e,f;
+	u64 prev_page_flt_rate;
+
+//	cur_mm = current->mm;
 
+
+	// global lock
+	spin_lock(&(swap_token_lock));
+	
 	global_faults++;
+	current_logical_interval = global_faults - cur_mm->last_logical_faultstamp;
+
+	spin_lock(&(cur_mm->swap_token_lock));
+
+	cur_mm->mm_maj_flts++;
+	/* calculate the page fault rate */
+
+	current_interval = (get_jiffies_64() -  cur_mm->last_faultstamp_jiffies64);
+	if(current_interval == 0LLU) {	/* interval less than 1 jiffie */
+		current_interval_ns = cyc2ns(clock, (long)get_cycles() - (long)cur_mm->last_faultstamp_cycles); 
+	}
+	
+	cur_mm->last_faultstamp_jiffies64 = get_jiffies_64();
+	cur_mm->last_faultstamp_cycles = get_cycles();
+
+	prev_page_flt_rate = cur_mm->page_flt_rate;
+
+#define MULTIPLIER 1000000LLU
+	if(cur_mm->last_interval == 0) {
+		cur_mm->page_flt_rate = MULTIPLIER;
+		if(current_interval) {
+			a = sg_jiffies_2_ns(current_interval);
+			my_do_div64(cur_mm->page_flt_rate, a);
+		}
+		else 
+			my_do_div64(cur_mm->page_flt_rate, current_interval_ns);
+		
+	}
+	else {
+		a = 25LLU*cur_mm->page_flt_rate;
+		b = 100LLU;
+		c = 35*MULTIPLIER;
 
-	current_interval = global_faults - current->mm->faultstamp;
+		if(cur_mm->last_interval)
+			d = 100LLU * sg_jiffies_2_ns(cur_mm->last_interval);
+		else
+			d = 100LLU * cur_mm->last_interval_ns;
 
-	if (!spin_trylock(&swap_token_lock))
-		return;
+		e = 40*MULTIPLIER;
 
-	/* First come first served */
-	if (swap_token_mm == NULL) {
-		current->mm->token_priority = current->mm->token_priority + 2;
-		swap_token_mm = current->mm;
-		goto out;
+		if(current_interval)
+			f = 100LLU * sg_jiffies_2_ns(current_interval);
+		else
+			f = 100LLU * current_interval_ns;
+			
+		printk(KERN_NOTICE "SGDEBUG: f becomes %llu\n", f);
+		my_do_div64(a,b);
+		my_do_div64(c,d);
+		my_do_div64(e,f);
+		cur_mm->page_flt_rate = (a+c+e);
 	}
+#undef MULTIPLIER
 
-	if (current->mm != swap_token_mm) {
-		if (current_interval < current->mm->last_interval)
-			current->mm->token_priority++;
+	
+	printk(KERN_NOTICE "SGDEBUG: [pid:%u] # page-faults: %u; current_interval %llu; pg-flt rate: %llu \n", current->pid, cur_mm->mm_maj_flts, current_interval, cur_mm->page_flt_rate); 
+
+	/* calculate the current fraction of token_holders */
+	nr_total = nr_running();
+	cur_swap_token_frac = (num_swap_token_holders *100) / nr_total;
+
+	printk(KERN_NOTICE "SGDEBUG: [pid:%u] Total proc: %d ; # Holders: %d ; Holder-frac: %d \n", current->pid, nr_total, num_swap_token_holders, cur_swap_token_frac); 
+
+	/* the leftmost node in the rb-tree, with the least page-fault rate */ 
+	if( token_tree_leftmost != NULL)
+		leftmost_mm = container_of(token_tree_leftmost, struct mm_struct, token_tree_node);
+	
+	/* check if 'eligible' to get the token */
+	if( (prev_page_flt_rate < cur_mm->page_flt_rate) && 
+		(((token_tree_leftmost != NULL) &&  (cur_mm->page_flt_rate > leftmost_mm->page_flt_rate)) || 
+		 (cur_swap_token_frac < max_swap_token_frac)) )  {
+
+		printk(KERN_NOTICE "SGDEBUG: [pid:%u] Eligible for token\n", current->pid); 
+		if(cur_mm->has_swap_token) {
+			/* update the position in the tree */
+			token_tree_delete(cur_mm);
+			token_tree_insert(cur_mm);
+		}
 		else {
-			if (likely(current->mm->token_priority > 0))
-				current->mm->token_priority--;
+			/* give the token */
+			cur_mm->has_swap_token = 1;
+			token_tree_insert(cur_mm);
+			num_swap_token_holders++;
 		}
-		/* Check if we deserve the token */
-		if (current->mm->token_priority >
-				swap_token_mm->token_priority) {
-			current->mm->token_priority += 2;
-			swap_token_mm = current->mm;
+		/* set the expiry time of the token */
+		cur_mm->token_expiry_jiffies64 = get_jiffies_64() + 
+			(HZ * vm_token_validity_period_ms) / 1000;
+
+		/* re-calculate the fraction of token holders */
+		cur_swap_token_frac = (num_swap_token_holders *100) / nr_running();
+		
+		/* the leftmost_token_tree may get updated, so recompute leftmost_mm  */
+		leftmost_mm = container_of(token_tree_leftmost, struct mm_struct, token_tree_node);
+
+		/* if max allowed fraction exceeded take the token from the leftmost */
+		if(cur_swap_token_frac > max_swap_token_frac) {
+//			printk(KERN_NOTICE "SGDEBUG: (grab_swap_token) mm = %x\n", cur_mm); 
+			__put_swap_token(leftmost_mm);
 		}
-	} else {
-		/* Token holder came in again! */
-		current->mm->token_priority += 2;
 	}
+	/* if not eligible for page-faults and token has expired*/
+	else if(cur_mm->has_swap_token && 
+			time_after64(get_jiffies_64(), cur_mm->token_expiry_jiffies64) ) {		
+//		printk(KERN_NOTICE "SGDEBUG: (grab_swap_token) mm = %x\n", cur_mm); 
+		__put_swap_token(cur_mm);
+	}
+	cur_mm->last_logical_faultstamp = global_faults;
+	cur_mm->last_logical_interval = current_logical_interval;
+	cur_mm->last_interval_ns = current_interval_ns;
+	cur_mm->last_interval = current_interval;
+	spin_unlock(&(cur_mm->swap_token_lock));
+	spin_unlock(&(swap_token_lock));
 
-out:
-	current->mm->faultstamp = global_faults;
-	current->mm->last_interval = current_interval;
-	spin_unlock(&swap_token_lock);
-return;
+	return;
 }
 
-/* Called on process exit. */
+/* Revokes the swap token */
+
+/* Expects that the mm->swap_token_lock is held before it is called */
 void __put_swap_token(struct mm_struct *mm)
 {
-	spin_lock(&swap_token_lock);
-	if (likely(mm == swap_token_mm))
-		swap_token_mm = NULL;
-	spin_unlock(&swap_token_lock);
+//	printk(KERN_NOTICE "SGDEBUG: (put_swap_token) mm = %x\n", mm); 
+//	spin_lock(&(mm->swap_token_lock));
+	if (likely(mm->has_swap_token)) {
+		mm->has_swap_token = 0;
+		token_tree_delete(mm);
+		num_swap_token_holders--;
+		printk(KERN_NOTICE "SGDEBUG: Token revoked\n"); 
+	}
+//	spin_unlock(&(mm->swap_token_lock));
 }
+