How to measure the total time spent waiting for a mutex, excluding context-switch time?

Navin Parakkal <navinp1912@xxxxxxxxx> · Sat, 27 Feb 2016 11:02:49 +0530

Hi,

I have a program built with gcc/g++.

gcc version 5.2.1 20151010 (Ubuntu 5.2.1-22ubuntu2)  on Ubuntu 15.10.

The entire code is posted below with compiler options.

Each function has the following form:

// Accumulated lock-wait time in seconds; this sketch only updates ax[0].
double ax[7];
// Nanoseconds per second, used to convert a timespec to a single ns count.
const long long MYNANO=1e9;

// Sketch of the pattern every function in the full program follows:
// read the clock, acquire the mutex, read the clock again, and add the
// difference (converted to seconds) to the counter for that mutex.
uint64_t ispM1(const uint64_t &K,map<uint64_t,uint64_t> &M){

struct timespec t1;
// Timestamp taken just before asking for the mutex.
int ret=clock_gettime(CLOCK_MONOTONIC_RAW,&t1);
long long before =(t1.tv_sec*MYNANO) + t1.tv_nsec;

lock_guard<mutex> guard(mut_M1);

// Timestamp taken once the mutex is held.
ret=clock_gettime(CLOCK_MONOTONIC_RAW,&t1);
long long after =(t1.tv_sec*MYNANO) + t1.tv_nsec;

// Elapsed nanoseconds converted to seconds.
ax[0]+=(after-before)*1./MYNANO;

// Placeholder for the real body; the complete functions are listed further down.
/* do something*/,
return ;
}

What I think is happening is this: when you have 256 or 512 threads, all
the threads have `before` set at values like 1, 2, 3 ... 512. They then
all try to get the mutex (mut_M1); only one gets it, and the rest wait for
it, most of them context switching (voluntarily or involuntarily) in the
meantime. When one of the waiters finally gets the mutex, and since each
thread takes at most about 90 seconds to get the mutex, it has `after` = 90.
So you have 512*100, i.e. around 51200 seconds, as the maximum total wait
for M1. The same goes for `after`.

Now this works out well with values
M1          -> 18636.46859

The M1 is the sum of times all threads waited on mutex M1.

1) Is there any other reason ?

2) How do I get more accurate values? That is, I want to subtract the
total time that all the threads waiting on mut_M1 spent in context
switching. Does gcc/Linux provide anything for that? I see that
/proc/<pid>/task/<tid>/status provides only the number of voluntary and
involuntary context switches; there is no time per thread.

3) If I boot with nohz_full=4 on the kernel command line and run only
this task on that particular CPU (scheduling it with
taskset -c 4 ./a.out), will that help, or is it also going to suffer from
the same problem?

4) Suppose a context switch happens right after clock_gettime() but
before the mutex is acquired (i.e. the function is not yet inside the
mutex). After returning from the context switch, can we make the thread
call clock_gettime() again? Is there any signal or event that lets us
know, so that we can update the value of `before`? In other words, I want a
function that returns the start time and the resume/end time of every
context switch (voluntary or involuntary) per thread between two
measurement lines, such as L1 and L2, so that I can deduct that time.

The wall clock time is 1 minute 15 seconds, i.e. less than 100 seconds,
and I have 4 CPUs, so the total CPU time available shouldn't be more than
400 seconds (or 800 if hyperthreading is enabled). So about 1000 seconds
is the upper limit of what my CPUs can run in that wall clock time.

But due to context switches (voluntary and involuntary) I get around
18636 seconds as the time lost in waiting for the mutex.

/usr/bin/time -v ./a.out
M1          -> 18636.46859
M2          -> 48.38547
STK         -> 3.60696
CACHE READ  -> 0.00502
CACHE WRITE -> 0.00291
Command being timed: "./a.out"
User time (seconds): 44.47
System time (seconds): 152.42
Percent of CPU this job got: 260%
Elapsed (wall clock) time (h:mm:ss or m:ss): 1:15.72
Average shared text size (kbytes): 0
Average unshared data size (kbytes): 0
Average stack size (kbytes): 0
Average total size (kbytes): 0
Maximum resident set size (kbytes): 41020
Average resident set size (kbytes): 0
Major (requiring I/O) page faults: 0
Minor (reclaiming a frame) page faults: 5666
Voluntary context switches: 19084080
Involuntary context switches: 937370
Swaps: 0
File system inputs: 0
File system outputs: 0
Socket messages sent: 0
Socket messages received: 0
Signals delivered: 0
Page size (bytes): 4096
Exit status: 0

g++ -O2 -pthread -std=c++11 P1xx.cpp

#include<iostream>
#include<cstdio>
#include<cstdint>
#include<map>
#include<stack>
#include<cassert>
#include<algorithm>
#include<atomic>
#include<mutex>
#include<thread>
#include<ctime>
using namespace std;
// Field delimiter; not referenced by any code shown in this file.
const char delim=',';

// Base size constant; VALUES_LIM and BOUND_LIM are derived from it.
const int PARALLEL_TASKS=256;

// Upper bound (exclusive) for the key/value loops in setxval() and
// call_set_cacheval(); main() also compares its loop index against it.
const int VALUES_LIM=PARALLEL_TASKS;
// Size of each thread array in main() and upper bound (exclusive) for the
// loops in rdr1(), rdr2() and callerase().
const int BOUND_LIM=VALUES_LIM*2;

// Transaction-log entry: (tag, (key, value)).
// Tags used in this file: 0 = marker pushed by dostart(), 3 = records pushed
// by setval(), 4 = record pushed by doerase().
typedef pair<int,pair<uint64_t,uint64_t> > TUP;

// mut_M1, mut_M2 and mut_STK are taken by the functions working on the M1 map,
// the M2 map and the transaction stack; mut_cache_lock is taken by
// set_cacheval(), mut_write_lock by ispcache() and setvalcache().
mutex mut_M1,mut_M2,mut_STK,mut_cache_lock,mut_write_lock;
// Accumulated lock-wait time in seconds, one slot per mutex:
// ax[0] mut_M1, ax[1] mut_M2, ax[2] mut_STK, ax[3] mut_cache_lock,
// ax[4] mut_write_lock. ax[5] and ax[6] are not used by the code shown.
double ax[7];
// Nanoseconds per second; converts timespec values to ns and ns back to seconds.
const long long MYNANO=1e9;

// No-op thread body; main() starts it in slots that get no real work.
void dummy() {}

uint64_t ispM1(const uint64_t &K,map<uint64_t,uint64_t> &M){

struct timespec t1;
        int ret=clock_gettime(CLOCK_MONOTONIC_RAW,&t1);
long long before =(t1.tv_sec*MYNANO) + t1.tv_nsec;

lock_guard<mutex> guard(mut_M1);

        ret=clock_gettime(CLOCK_MONOTONIC_RAW,&t1);
long long after =(t1.tv_sec*MYNANO) + t1.tv_nsec;

ax[0]+=(after-before)*1./MYNANO;

if(M.find(K)==M.end()) return 0;
return M[K];
}

uint64_t ispM2(const uint64_t &K,map<uint64_t,uint64_t> &M){

struct timespec t1;
        int ret=clock_gettime(CLOCK_MONOTONIC_RAW,&t1);
long long before =(t1.tv_sec*MYNANO) + t1.tv_nsec;

lock_guard<mutex> guard(mut_M2);

        ret=clock_gettime(CLOCK_MONOTONIC_RAW,&t1);
long long after =(t1.tv_sec*MYNANO) + t1.tv_nsec;

ax[1]+=(after-before)*1./MYNANO;

if(M.find(K)==M.end()) return 0;
return M[K];
}

uint64_t ispcache(const string &K,map<string,uint64_t> &M){

struct timespec t1;
        int ret=clock_gettime(CLOCK_MONOTONIC_RAW,&t1);
long long before =(t1.tv_sec*MYNANO) + t1.tv_nsec;

lock_guard<mutex> guard(mut_write_lock);

        ret=clock_gettime(CLOCK_MONOTONIC_RAW,&t1);
long long after =(t1.tv_sec*MYNANO) + t1.tv_nsec;

ax[4]+=(after - before)*1./MYNANO;

if(M.find(K)==M.end()) return 0;
return M[K];
}

void setvalcache(const string K,uint64_t &V,map<string,uint64_t>
&M,map<uint64_t,string> &RM){

struct timespec t1;
        int ret=clock_gettime(CLOCK_MONOTONIC_RAW,&t1);
long long before =(t1.tv_sec*MYNANO) + t1.tv_nsec;

lock_guard<mutex>  guard(mut_write_lock);

        ret=clock_gettime(CLOCK_MONOTONIC_RAW,&t1);
long long after =(t1.tv_sec*MYNANO) + t1.tv_nsec;

ax[4]+=(after-before)*1./MYNANO;

uint64_t tmpx(V);
M[K]=tmpx;
RM[tmpx]=K;
}

uint64_t set_cacheval(const string str,map<string,uint64_t> &M,
uint64_t &idx,map<uint64_t,string> &RM)
{
struct timespec t1;
        int ret=clock_gettime(CLOCK_MONOTONIC_RAW,&t1);
long long before =(t1.tv_sec*MYNANO) + t1.tv_nsec;
lock_guard<mutex>  guard(mut_cache_lock);

        ret=clock_gettime(CLOCK_MONOTONIC_RAW,&t1);
long long after =(t1.tv_sec*MYNANO) + t1.tv_nsec;

ax[3]+=(after-before)*1./MYNANO;

uint64_t val=ispcache(str,M);
if(!val)
{
setvalcache(str,idx,M,RM);
val=idx;
idx++;
}
return val;

}

// Decrements the count stored for K in M while holding mut_M2; the entry is
// removed when the count would reach zero. The lock wait time is added to
// ax[1] in seconds.
//
// A key that is not present is left alone. Using M[K] here would insert a
// zero count and then wrap it to 2^64-1 on decrement.
void deleteval(const uint64_t &K,map<uint64_t,uint64_t> &M)
{
    // Reads CLOCK_MONOTONIC_RAW and returns it as nanoseconds.
    auto now_ns = []() -> long long {
        struct timespec ts;
        clock_gettime(CLOCK_MONOTONIC_RAW, &ts);
        return (ts.tv_sec * MYNANO) + ts.tv_nsec;
    };

    const long long before = now_ns();
    lock_guard<mutex> guard(mut_M2);
    const long long after = now_ns();

    ax[1] += (after - before) * 1. / MYNANO;

    const auto it = M.find(K);
    if (it == M.end()) {
        return;                 // nothing counted for K, nothing to release
    }
    if (it->second <= 1) {
        M.erase(it);            // last reference (or a stray zero): drop the entry
    } else {
        --it->second;
    }
}

void updateval(const uint64_t &K,map<uint64_t,uint64_t> &M)
{
struct timespec t1;
        int ret=clock_gettime(CLOCK_MONOTONIC_RAW,&t1);
long long before =(t1.tv_sec*MYNANO) + t1.tv_nsec;

lock_guard<mutex> guard(mut_M2);

        ret=clock_gettime(CLOCK_MONOTONIC_RAW,&t1);
long long after =(t1.tv_sec*MYNANO) + t1.tv_nsec;

ax[1]+=(after-before)*1./MYNANO;
M[K]++;
}

void setval(const uint64_t &K,const uint64_t &V,map<uint64_t,uint64_t>
&M1,map<uint64_t,uint64_t> &M2,stack<TUP> &STK,bool logtrans=true)
{

uint64_t oldval(0);
oldval=ispM1(K,M1);

struct timespec t1;
        int ret=clock_gettime(CLOCK_MONOTONIC_RAW,&t1);
long long before =(t1.tv_sec*MYNANO) + t1.tv_nsec;

lock_guard<mutex> guard_M1(mut_M1);

        ret=clock_gettime(CLOCK_MONOTONIC_RAW,&t1);
long long after =(t1.tv_sec*MYNANO) + t1.tv_nsec;

ax[0]+=(after-before)*1./MYNANO;

if(oldval==V) {
return;
}

M1[K]=V;
updateval(V,M2);
deleteval(oldval,M2);

        ret=clock_gettime(CLOCK_MONOTONIC_RAW,&t1);
before =(t1.tv_sec*MYNANO) + t1.tv_nsec;

lock_guard<mutex> guardstk(mut_STK);

        ret=clock_gettime(CLOCK_MONOTONIC_RAW,&t1);
after =(t1.tv_sec*MYNANO) + t1.tv_nsec;

ax[2]+=(after-before)*1./MYNANO;

if(logtrans && STK.size()){
STK.push(pair<int,pair<uint64_t,uint64_t> >(3, pair<uint64_t,uint64_t>
(K,oldval)));
STK.push(pair<int,pair<uint64_t,uint64_t> >(3, pair<uint64_t,uint64_t> (K,V)));
}

return;
}

uint64_t getval(const uint64_t &K, const map<uint64_t,uint64_t> &M1)
{

struct timespec t1;
        int ret=clock_gettime(CLOCK_MONOTONIC_RAW,&t1);
long long before =(t1.tv_sec*MYNANO) + t1.tv_nsec;

lock_guard<mutex> guard(mut_M1);

        ret=clock_gettime(CLOCK_MONOTONIC_RAW,&t1);
long long after =(t1.tv_sec*MYNANO) + t1.tv_nsec;

ax[0]+=(after-before)*1./MYNANO;
if(M1.count(K)) return M1.at(K);
return 0;
}

void doerase(const uint64_t &K,map<uint64_t,uint64_t>
&M1,map<uint64_t,uint64_t> &M2,stack<TUP> &STK,bool logtrans=true)
{
struct timespec t1;
        int ret=clock_gettime(CLOCK_MONOTONIC_RAW,&t1);
long long before =(t1.tv_sec*MYNANO) + t1.tv_nsec;
lock_guard<mutex> guard(mut_M1);

        ret=clock_gettime(CLOCK_MONOTONIC_RAW,&t1);
long long after =(t1.tv_sec*MYNANO) + t1.tv_nsec;
ax[0]+=(after-before)*1./MYNANO;

if(M1.find(K)!=M1.end()){
uint64_t val=M1[K];
deleteval(val,M2);
M1.erase(K);

        ret=clock_gettime(CLOCK_MONOTONIC_RAW,&t1);
before =(t1.tv_sec*MYNANO) + t1.tv_nsec;
lock_guard<mutex> guard(mut_STK);

        ret=clock_gettime(CLOCK_MONOTONIC_RAW,&t1);
after =(t1.tv_sec*MYNANO) + t1.tv_nsec;

ax[2]+=(after-before)*1./MYNANO;

if(logtrans && STK.size()){
STK.push(pair<int,pair<uint64_t,uint64_t> >(4,pair<uint64_t,uint64_t>(K,val)));
}
}
return;
}

uint64_t getcount(const uint64_t &V,const map<uint64_t,uint64_t> &M2)
{
struct timespec t1;
        int ret=clock_gettime(CLOCK_MONOTONIC_RAW,&t1);
long long before =(t1.tv_sec*MYNANO) + t1.tv_nsec;

lock_guard<mutex> guard(mut_M2);

        ret=clock_gettime(CLOCK_MONOTONIC_RAW,&t1);
long long after =(t1.tv_sec*MYNANO) + t1.tv_nsec;

ax[1]+=(after-before)*1./MYNANO;

if(M2.count(V)) return M2.at(V);
return 0;
}

// Opens a transaction by pushing a tag-0 marker (0, (0, 0)) onto STK while
// holding mut_STK. The lock wait time is added to ax[2] in seconds.
void dostart(stack<TUP> &STK)
{
    // Reads CLOCK_MONOTONIC_RAW and returns it as nanoseconds.
    auto now_ns = []() -> long long {
        struct timespec ts;
        clock_gettime(CLOCK_MONOTONIC_RAW, &ts);
        return (ts.tv_sec * MYNANO) + ts.tv_nsec;
    };

    const long long requested = now_ns();
    lock_guard<mutex> hold(mut_STK);
    const long long acquired = now_ns();

    ax[2] += (acquired - requested) * 1. / MYNANO;

    STK.push(TUP(0, pair<uint64_t,uint64_t>(0, 0)));
}

// Commits the innermost transaction: pops entries off STK up to and including
// the nearest tag-0 marker (or until the stack is empty) while holding
// mut_STK. The lock wait time is added to ax[2] in seconds.
void docommit(stack<TUP> &STK)
{
    // Reads CLOCK_MONOTONIC_RAW and returns it as nanoseconds.
    auto now_ns = []() -> long long {
        struct timespec ts;
        clock_gettime(CLOCK_MONOTONIC_RAW, &ts);
        return (ts.tv_sec * MYNANO) + ts.tv_nsec;
    };

    const long long requested = now_ns();
    lock_guard<mutex> hold(mut_STK);
    const long long acquired = now_ns();

    ax[2] += (acquired - requested) * 1. / MYNANO;

    while (!STK.empty()) {
        const int tag = STK.top().first;
        STK.pop();
        if (tag == 0) {
            break;
        }
    }
}

// Rolls back the innermost transaction. Entries are popped off STK up to and
// including the nearest tag-0 marker (or until the stack is empty):
//   - a tag-3 pair (K, new) / (K, old) restores K to `old`, or erases K when
//     `old` is 0;
//   - a tag-4 entry (K, val) re-inserts K with `val`;
//   - entries with any other tag are dropped.
// The lock wait time for mut_STK is added to ax[2] in seconds.
//
// The log is drained under mut_STK first and the undo actions are applied
// only after the lock has been released. setval() and doerase() lock mut_M1
// and mut_STK themselves; calling them while this thread still owns mut_STK
// would lock the same std::mutex twice (undefined behaviour) and would take
// the two mutexes in the opposite order to the writer threads.
void dorollback(stack<TUP> &STK ,map<uint64_t,uint64_t> &M1,map<uint64_t,uint64_t> &M2)
{
    // Reads CLOCK_MONOTONIC_RAW and returns it as nanoseconds.
    auto now_ns = []() -> long long {
        struct timespec ts;
        clock_gettime(CLOCK_MONOTONIC_RAW, &ts);
        return (ts.tv_sec * MYNANO) + ts.tv_nsec;
    };

    // Undo actions in the order they have to be applied. Each element is
    // (tag, (key, value to restore)); the stack's own container type is
    // reused so no extra header is needed.
    stack<TUP>::container_type undo;

    const long long before = now_ns();
    {
        lock_guard<mutex> guard(mut_STK);
        const long long after = now_ns();
        ax[2] += (after - before) * 1. / MYNANO;

        while (!STK.empty()) {
            const TUP t = STK.top();
            STK.pop();
            if (t.first == 0) {
                break;
            }
            if (t.first == 3) {
                if (STK.empty()) {
                    break;      // second half of the record is missing
                }
                const TUP tprev = STK.top();
                STK.pop();
                assert(t.first == tprev.first);
                undo.push_back(TUP(3, pair<uint64_t,uint64_t>(t.second.first, tprev.second.second)));
            } else if (t.first == 4) {      // Rolling back an erase
                undo.push_back(t);
            }
        }
    }

    for (const TUP &u : undo) {
        if (u.first == 3 && u.second.second == 0) {
            // The key did not exist before the logged set: remove it again.
            doerase(u.second.first, M1, M2, STK, false);
        } else {
            setval(u.second.first, u.second.second, M1, M2, STK, false);
        }
    }
}

/* Test stubs to check multithreading */

// Reader stub: calls getval() on M1 for every key in [4, BOUND_LIM).
// The values read are discarded.
void rdr1( const map<uint64_t,uint64_t> &M1)
{
    uint64_t last = 0;
    for (int key = 4; key < BOUND_LIM; ++key) {
        last = getval(key, M1);
    }
    (void)last;
}

// Reader stub: calls getcount() on M2 for every value in [4, BOUND_LIM).
// The counts read are discarded.
void rdr2(const map<uint64_t,uint64_t> &M2)
{
    uint64_t last = 0;
    for (int value = 4; value < BOUND_LIM; ++value) {
        last = getcount(value, M2);
    }
    (void)last;
}

// Writer stub, run by several threads in parallel: for every key in
// [2, VALUES_LIM) it sets each value in [3, VALUES_LIM) in turn through
// setval(), with transaction logging enabled.
void setxval(map<uint64_t,uint64_t> &M1,map<uint64_t,uint64_t> &M2,stack<TUP> &STK)
{
    for (int key = 2; key < VALUES_LIM; ++key) {
        for (int value = 3; value < VALUES_LIM; ++value) {
            setval(key, value, M1, M2, STK, true);
        }
    }
}

// Eraser stub: calls doerase() for every key in [2, BOUND_LIM), with
// transaction logging enabled.
void callerase(map<uint64_t,uint64_t> &M1,map<uint64_t,uint64_t> &M2,stack<TUP> &STK)
{
    for (int key = 2; key < BOUND_LIM; ++key) {
        doerase(key, M1, M2, STK, true);
    }
}
// Thread stub meant to be run once in a while: opens a transaction by
// forwarding to dostart(). main() in this file never starts it.
void callstart(stack<TUP> &STK)
{
 dostart(STK);
}

// Thread stub run once in a while: commits the innermost transaction by
// forwarding to docommit().

void callcommit(stack<TUP> &STK)
{
 docommit(STK);
}

// Thread stub run once in a while: rolls back the innermost transaction by
// forwarding to dorollback().

void callrollback(stack<TUP> &STK ,map<uint64_t,uint64_t>
&M1,map<uint64_t,uint64_t> &M2)
{
 dorollback(STK ,M1,M2);
}

// Registers the strings "str_10" ... "str_<VALUES_LIM-1>" in the string cache
// through set_cacheval().
//
// The name is formatted with a bounded snprintf() into a buffer large enough
// for any int, so raising VALUES_LIM cannot overrun it.
void call_set_cacheval(map<string,uint64_t> &M, uint64_t &idx,map<uint64_t,string> &RM)
{
    for (int K = 10; K < VALUES_LIM; K++) {
        char buf[32];
        snprintf(buf, sizeof buf, "str_%d", K);
        string tmp(buf);
        set_cacheval(tmp, M, idx, RM);
    }
}
/* End of multithreading stubs */

int main()
{
string s;
map<uint64_t,uint64_t> M1;
map<uint64_t,uint64_t> M2;
map<string,uint64_t > STRCACHE;
map<uint64_t,string> REVSTRCACHE;

stack< pair<int,pair<uint64_t,uint64_t> >  > STK;

// An intiali value other than 0 so that we have the index
uint64_t stridx(10);

thread tr1[BOUND_LIM];
thread tr2[BOUND_LIM];

thread tw[BOUND_LIM];

thread te[BOUND_LIM];

thread tg[BOUND_LIM];

for(int ii=0;ii<BOUND_LIM;ii++){
if((ii%8)==0) call_set_cacheval(STRCACHE,ref(stridx),REVSTRCACHE);

tr1[ii]=thread(rdr1,ref(M1));
tr2[ii]=thread(rdr2,ref(M2));

if((ii%4)==0){
te[ii]=thread(callerase,ref(M1),ref(M2),ref(STK));
}  else te[ii]=thread(dummy);

if((ii%2)==0){
tw[ii]=thread(setxval,ref(M1),ref(M2),ref(STK));
} else  tw[ii]=thread(dummy);
if(ii>VALUES_LIM){
if((ii%3)==0){
tg[ii]=thread(callrollback,ref(STK),ref(M1),ref(M2));
}
else if((ii%7)==0){
tg[ii]=thread(callcommit,ref(STK));
}
else if((ii%11)==0){
tg[ii]=thread(callrollback,ref(STK),ref(M1),ref(M2));
} else {
tg[ii]=thread(dummy);
}
} else {
tg[ii]=thread(dummy);
}
}

for(int ii=0;ii<BOUND_LIM;ii++){
tr1[ii].join();
tr2[ii].join();
tw[ii].join();
te[ii].join();
tg[ii].join();
}

printf("M1          -> %.5f \n",ax[0]);
printf("M2          -> %.5f \n",ax[1]);
printf("STK         -> %.5f \n",ax[2]);
printf("CACHE READ  -> %.5f \n",ax[3]);
printf("CACHE WRITE -> %.5f \n",ax[4]);
return 0;
}

Regards,
Navin
#include<iostream>
#include<cstdio>
#include<cstdint>
#include<map>
#include<stack>
#include<cassert>
#include<algorithm>
#include<atomic>
#include<mutex>
#include<thread>
#include<ctime>
using namespace std;
// Field delimiter; not referenced by any code shown in this file.
const char delim=',';

// Base size constant; VALUES_LIM and BOUND_LIM are derived from it.
const int PARALLEL_TASKS=256;

// Upper bound (exclusive) for the key/value loops in setxval() and
// call_set_cacheval(); main() also compares its loop index against it.
const int VALUES_LIM=PARALLEL_TASKS;
// Size of each thread array in main() and upper bound (exclusive) for the
// loops in rdr1(), rdr2() and callerase().
const int BOUND_LIM=VALUES_LIM*2;

// Transaction-log entry: (tag, (key, value)).
// Tags used in this file: 0 = marker pushed by dostart(), 3 = records pushed
// by setval(), 4 = record pushed by doerase().
typedef pair<int,pair<uint64_t,uint64_t> > TUP;

// mut_M1, mut_M2 and mut_STK are taken by the functions working on the M1 map,
// the M2 map and the transaction stack; mut_cache_lock is taken by
// set_cacheval(), mut_write_lock by ispcache() and setvalcache().
mutex mut_M1,mut_M2,mut_STK,mut_cache_lock,mut_write_lock;
// Accumulated lock-wait time in seconds, one slot per mutex:
// ax[0] mut_M1, ax[1] mut_M2, ax[2] mut_STK, ax[3] mut_cache_lock,
// ax[4] mut_write_lock. ax[5] and ax[6] are not used by the code shown.
double ax[7];
// Nanoseconds per second; converts timespec values to ns and ns back to seconds.
const long long MYNANO=1e9;

// No-op thread body; main() starts it in slots that get no real work.
void dummy() {}

uint64_t ispM1(const uint64_t &K,map<uint64_t,uint64_t> &M){

	struct timespec t1;
        int ret=clock_gettime(CLOCK_MONOTONIC_RAW,&t1);
	long long before =(t1.tv_sec*MYNANO) + t1.tv_nsec;

	lock_guard<mutex> guard(mut_M1);

        ret=clock_gettime(CLOCK_MONOTONIC_RAW,&t1);
	long long after =(t1.tv_sec*MYNANO) + t1.tv_nsec;

	ax[0]+=(after-before)*1./MYNANO;

	if(M.find(K)==M.end()) return 0;
	return M[K];
}

uint64_t ispM2(const uint64_t &K,map<uint64_t,uint64_t> &M){

	struct timespec t1;
        int ret=clock_gettime(CLOCK_MONOTONIC_RAW,&t1);
	long long before =(t1.tv_sec*MYNANO) + t1.tv_nsec;

	lock_guard<mutex> guard(mut_M2);

        ret=clock_gettime(CLOCK_MONOTONIC_RAW,&t1);
	long long after =(t1.tv_sec*MYNANO) + t1.tv_nsec;

	ax[1]+=(after-before)*1./MYNANO;

	if(M.find(K)==M.end()) return 0;
	return M[K];
}

uint64_t ispcache(const string &K,map<string,uint64_t> &M){

	struct timespec t1;
        int ret=clock_gettime(CLOCK_MONOTONIC_RAW,&t1);
	long long before =(t1.tv_sec*MYNANO) + t1.tv_nsec;

	lock_guard<mutex> guard(mut_write_lock);

        ret=clock_gettime(CLOCK_MONOTONIC_RAW,&t1);
	long long after =(t1.tv_sec*MYNANO) + t1.tv_nsec;

	ax[4]+=(after - before)*1./MYNANO;

	if(M.find(K)==M.end()) return 0;
	return M[K];
}

void setvalcache(const string K,uint64_t &V,map<string,uint64_t> &M,map<uint64_t,string> &RM){

	struct timespec t1;
        int ret=clock_gettime(CLOCK_MONOTONIC_RAW,&t1);
	long long before =(t1.tv_sec*MYNANO) + t1.tv_nsec;

	lock_guard<mutex>  guard(mut_write_lock);

        ret=clock_gettime(CLOCK_MONOTONIC_RAW,&t1);
	long long after =(t1.tv_sec*MYNANO) + t1.tv_nsec;

	ax[4]+=(after-before)*1./MYNANO;

		uint64_t tmpx(V);
		M[K]=tmpx;
		RM[tmpx]=K;
}

uint64_t set_cacheval(const string str,map<string,uint64_t> &M, uint64_t &idx,map<uint64_t,string> &RM)
{
	struct timespec t1;
        int ret=clock_gettime(CLOCK_MONOTONIC_RAW,&t1);
	long long before =(t1.tv_sec*MYNANO) + t1.tv_nsec;

	lock_guard<mutex>  guard(mut_cache_lock);

        ret=clock_gettime(CLOCK_MONOTONIC_RAW,&t1);
	long long after =(t1.tv_sec*MYNANO) + t1.tv_nsec;

	ax[3]+=(after-before)*1./MYNANO;

	uint64_t val=ispcache(str,M);
	if(!val) 
	{
		setvalcache(str,idx,M,RM);
		val=idx;
		idx++;
	}  
	return val;

}

// Decrements the count stored for K in M while holding mut_M2; the entry is
// removed when the count would reach zero. The lock wait time is added to
// ax[1] in seconds.
//
// A key that is not present is left alone. Using M[K] here would insert a
// zero count and then wrap it to 2^64-1 on decrement.
void deleteval(const uint64_t &K,map<uint64_t,uint64_t> &M)
{
    // Reads CLOCK_MONOTONIC_RAW and returns it as nanoseconds.
    auto now_ns = []() -> long long {
        struct timespec ts;
        clock_gettime(CLOCK_MONOTONIC_RAW, &ts);
        return (ts.tv_sec * MYNANO) + ts.tv_nsec;
    };

    const long long before = now_ns();
    lock_guard<mutex> guard(mut_M2);
    const long long after = now_ns();

    ax[1] += (after - before) * 1. / MYNANO;

    const auto it = M.find(K);
    if (it == M.end()) {
        return;                 // nothing counted for K, nothing to release
    }
    if (it->second <= 1) {
        M.erase(it);            // last reference (or a stray zero): drop the entry
    } else {
        --it->second;
    }
}

void updateval(const uint64_t &K,map<uint64_t,uint64_t> &M)
{
	struct timespec t1;
        int ret=clock_gettime(CLOCK_MONOTONIC_RAW,&t1);
	long long before =(t1.tv_sec*MYNANO) + t1.tv_nsec;

	lock_guard<mutex> guard(mut_M2);

        ret=clock_gettime(CLOCK_MONOTONIC_RAW,&t1);
	long long after =(t1.tv_sec*MYNANO) + t1.tv_nsec;

	ax[1]+=(after-before)*1./MYNANO;
	M[K]++;
}

void setval(const uint64_t &K,const uint64_t &V,map<uint64_t,uint64_t> &M1,map<uint64_t,uint64_t> &M2,stack<TUP> &STK,bool logtrans=true)
{

	uint64_t oldval(0);
	oldval=ispM1(K,M1);

	struct timespec t1;
        int ret=clock_gettime(CLOCK_MONOTONIC_RAW,&t1);
	long long before =(t1.tv_sec*MYNANO) + t1.tv_nsec;

	lock_guard<mutex> guard_M1(mut_M1);

        ret=clock_gettime(CLOCK_MONOTONIC_RAW,&t1);
	long long after =(t1.tv_sec*MYNANO) + t1.tv_nsec;

	ax[0]+=(after-before)*1./MYNANO;

	if(oldval==V) {
		return;
	}

	M1[K]=V;
	updateval(V,M2);
	deleteval(oldval,M2);

        ret=clock_gettime(CLOCK_MONOTONIC_RAW,&t1);
	before =(t1.tv_sec*MYNANO) + t1.tv_nsec;

	lock_guard<mutex> guardstk(mut_STK);

        ret=clock_gettime(CLOCK_MONOTONIC_RAW,&t1);
	after =(t1.tv_sec*MYNANO) + t1.tv_nsec;

	ax[2]+=(after-before)*1./MYNANO;

	if(logtrans && STK.size()){
		STK.push(pair<int,pair<uint64_t,uint64_t> >(3, pair<uint64_t,uint64_t> (K,oldval)));
		STK.push(pair<int,pair<uint64_t,uint64_t> >(3, pair<uint64_t,uint64_t> (K,V)));
	}

	return;
}

uint64_t getval(const uint64_t &K, const map<uint64_t,uint64_t> &M1) 
{

	struct timespec t1;
        int ret=clock_gettime(CLOCK_MONOTONIC_RAW,&t1);
	long long before =(t1.tv_sec*MYNANO) + t1.tv_nsec;

	lock_guard<mutex> guard(mut_M1);

        ret=clock_gettime(CLOCK_MONOTONIC_RAW,&t1);
	long long after =(t1.tv_sec*MYNANO) + t1.tv_nsec;

	ax[0]+=(after-before)*1./MYNANO;
	if(M1.count(K)) return M1.at(K);
	return 0;
}

void doerase(const uint64_t &K,map<uint64_t,uint64_t> &M1,map<uint64_t,uint64_t> &M2,stack<TUP> &STK,bool logtrans=true)
{
	struct timespec t1;
        int ret=clock_gettime(CLOCK_MONOTONIC_RAW,&t1);
	long long before =(t1.tv_sec*MYNANO) + t1.tv_nsec;
	lock_guard<mutex> guard(mut_M1);

        ret=clock_gettime(CLOCK_MONOTONIC_RAW,&t1);
	long long after =(t1.tv_sec*MYNANO) + t1.tv_nsec;
	ax[0]+=(after-before)*1./MYNANO;

	if(M1.find(K)!=M1.end()){
		uint64_t val=M1[K];
		deleteval(val,M2);
		M1.erase(K);

        ret=clock_gettime(CLOCK_MONOTONIC_RAW,&t1);
	before =(t1.tv_sec*MYNANO) + t1.tv_nsec;
		lock_guard<mutex> guard(mut_STK);

        ret=clock_gettime(CLOCK_MONOTONIC_RAW,&t1);
	after =(t1.tv_sec*MYNANO) + t1.tv_nsec;

	ax[2]+=(after-before)*1./MYNANO;

		if(logtrans && STK.size()){
			STK.push(pair<int,pair<uint64_t,uint64_t> >(4,pair<uint64_t,uint64_t>(K,val)));
		}
	}
	return;
}

uint64_t getcount(const uint64_t &V,const map<uint64_t,uint64_t> &M2)
{
	struct timespec t1;
        int ret=clock_gettime(CLOCK_MONOTONIC_RAW,&t1);
	long long before =(t1.tv_sec*MYNANO) + t1.tv_nsec;

	lock_guard<mutex> guard(mut_M2);

        ret=clock_gettime(CLOCK_MONOTONIC_RAW,&t1);
	long long after =(t1.tv_sec*MYNANO) + t1.tv_nsec;

	ax[1]+=(after-before)*1./MYNANO;

	if(M2.count(V)) return M2.at(V);
	return 0;
}

// Opens a transaction by pushing a tag-0 marker (0, (0, 0)) onto STK while
// holding mut_STK. The lock wait time is added to ax[2] in seconds.
void dostart(stack<TUP> &STK)
{
    // Reads CLOCK_MONOTONIC_RAW and returns it as nanoseconds.
    auto now_ns = []() -> long long {
        struct timespec ts;
        clock_gettime(CLOCK_MONOTONIC_RAW, &ts);
        return (ts.tv_sec * MYNANO) + ts.tv_nsec;
    };

    const long long requested = now_ns();
    lock_guard<mutex> hold(mut_STK);
    const long long acquired = now_ns();

    ax[2] += (acquired - requested) * 1. / MYNANO;

    STK.push(TUP(0, pair<uint64_t,uint64_t>(0, 0)));
}

// Commits the innermost transaction: pops entries off STK up to and including
// the nearest tag-0 marker (or until the stack is empty) while holding
// mut_STK. The lock wait time is added to ax[2] in seconds.
void docommit(stack<TUP> &STK)
{
    // Reads CLOCK_MONOTONIC_RAW and returns it as nanoseconds.
    auto now_ns = []() -> long long {
        struct timespec ts;
        clock_gettime(CLOCK_MONOTONIC_RAW, &ts);
        return (ts.tv_sec * MYNANO) + ts.tv_nsec;
    };

    const long long requested = now_ns();
    lock_guard<mutex> hold(mut_STK);
    const long long acquired = now_ns();

    ax[2] += (acquired - requested) * 1. / MYNANO;

    while (!STK.empty()) {
        const int tag = STK.top().first;
        STK.pop();
        if (tag == 0) {
            break;
        }
    }
}

// Rolls back the innermost transaction. Entries are popped off STK up to and
// including the nearest tag-0 marker (or until the stack is empty):
//   - a tag-3 pair (K, new) / (K, old) restores K to `old`, or erases K when
//     `old` is 0;
//   - a tag-4 entry (K, val) re-inserts K with `val`;
//   - entries with any other tag are dropped.
// The lock wait time for mut_STK is added to ax[2] in seconds.
//
// The log is drained under mut_STK first and the undo actions are applied
// only after the lock has been released. setval() and doerase() lock mut_M1
// and mut_STK themselves; calling them while this thread still owns mut_STK
// would lock the same std::mutex twice (undefined behaviour) and would take
// the two mutexes in the opposite order to the writer threads.
void dorollback(stack<TUP> &STK ,map<uint64_t,uint64_t> &M1,map<uint64_t,uint64_t> &M2)
{
    // Reads CLOCK_MONOTONIC_RAW and returns it as nanoseconds.
    auto now_ns = []() -> long long {
        struct timespec ts;
        clock_gettime(CLOCK_MONOTONIC_RAW, &ts);
        return (ts.tv_sec * MYNANO) + ts.tv_nsec;
    };

    // Undo actions in the order they have to be applied. Each element is
    // (tag, (key, value to restore)); the stack's own container type is
    // reused so no extra header is needed.
    stack<TUP>::container_type undo;

    const long long before = now_ns();
    {
        lock_guard<mutex> guard(mut_STK);
        const long long after = now_ns();
        ax[2] += (after - before) * 1. / MYNANO;

        while (!STK.empty()) {
            const TUP t = STK.top();
            STK.pop();
            if (t.first == 0) {
                break;
            }
            if (t.first == 3) {
                if (STK.empty()) {
                    break;      // second half of the record is missing
                }
                const TUP tprev = STK.top();
                STK.pop();
                assert(t.first == tprev.first);
                undo.push_back(TUP(3, pair<uint64_t,uint64_t>(t.second.first, tprev.second.second)));
            } else if (t.first == 4) {      // Rolling back an erase
                undo.push_back(t);
            }
        }
    }

    for (const TUP &u : undo) {
        if (u.first == 3 && u.second.second == 0) {
            // The key did not exist before the logged set: remove it again.
            doerase(u.second.first, M1, M2, STK, false);
        } else {
            setval(u.second.first, u.second.second, M1, M2, STK, false);
        }
    }
}

/* Test stubs to check multithreading */

// Reader stub: calls getval() on M1 for every key in [4, BOUND_LIM).
// The values read are discarded.
void rdr1( const map<uint64_t,uint64_t> &M1)
{
    uint64_t last = 0;
    for (int key = 4; key < BOUND_LIM; ++key) {
        last = getval(key, M1);
    }
    (void)last;
}

// Reader stub: calls getcount() on M2 for every value in [4, BOUND_LIM).
// The counts read are discarded.
void rdr2(const map<uint64_t,uint64_t> &M2)
{
    uint64_t last = 0;
    for (int value = 4; value < BOUND_LIM; ++value) {
        last = getcount(value, M2);
    }
    (void)last;
}

// Writer stub, run by several threads in parallel: for every key in
// [2, VALUES_LIM) it sets each value in [3, VALUES_LIM) in turn through
// setval(), with transaction logging enabled.
void setxval(map<uint64_t,uint64_t> &M1,map<uint64_t,uint64_t> &M2,stack<TUP> &STK)
{
    for (int key = 2; key < VALUES_LIM; ++key) {
        for (int value = 3; value < VALUES_LIM; ++value) {
            setval(key, value, M1, M2, STK, true);
        }
    }
}

// Eraser stub: calls doerase() for every key in [2, BOUND_LIM), with
// transaction logging enabled.
void callerase(map<uint64_t,uint64_t> &M1,map<uint64_t,uint64_t> &M2,stack<TUP> &STK)
{
    for (int key = 2; key < BOUND_LIM; ++key) {
        doerase(key, M1, M2, STK, true);
    }
}
// Thread stub meant to be run once in a while: opens a transaction by
// forwarding to dostart(). main() in this file never starts it.
void callstart(stack<TUP> &STK)
{
 dostart(STK);
}

// Thread stub run once in a while: commits the innermost transaction by
// forwarding to docommit().

void callcommit(stack<TUP> &STK)
{
 docommit(STK);
}

// Thread stub run once in a while: rolls back the innermost transaction by
// forwarding to dorollback().

void callrollback(stack<TUP> &STK ,map<uint64_t,uint64_t> &M1,map<uint64_t,uint64_t> &M2)
{
 dorollback(STK ,M1,M2);
}

// Registers the strings "str_10" ... "str_<VALUES_LIM-1>" in the string cache
// through set_cacheval().
//
// The name is formatted with a bounded snprintf() into a buffer large enough
// for any int, so raising VALUES_LIM cannot overrun it.
void call_set_cacheval(map<string,uint64_t> &M, uint64_t &idx,map<uint64_t,string> &RM)
{
    for (int K = 10; K < VALUES_LIM; K++) {
        char buf[32];
        snprintf(buf, sizeof buf, "str_%d", K);
        string tmp(buf);
        set_cacheval(tmp, M, idx, RM);
    }
}
/* End of multithreading stubs */

int main()
{
	string s;
	map<uint64_t,uint64_t> M1;
	map<uint64_t,uint64_t> M2;
	map<string,uint64_t > STRCACHE;
	map<uint64_t,string> REVSTRCACHE;

	stack< pair<int,pair<uint64_t,uint64_t> >  > STK;

	// An intiali value other than 0 so that we have the index
	uint64_t stridx(10);

	thread tr1[BOUND_LIM];
	thread tr2[BOUND_LIM];

	thread tw[BOUND_LIM];

	thread te[BOUND_LIM];

	thread tg[BOUND_LIM];

	for(int ii=0;ii<BOUND_LIM;ii++){
		if((ii%8)==0) call_set_cacheval(STRCACHE,ref(stridx),REVSTRCACHE);

		tr1[ii]=thread(rdr1,ref(M1));
		tr2[ii]=thread(rdr2,ref(M2));

		if((ii%4)==0){
			te[ii]=thread(callerase,ref(M1),ref(M2),ref(STK));
		}  else te[ii]=thread(dummy);

		if((ii%2)==0){
			tw[ii]=thread(setxval,ref(M1),ref(M2),ref(STK));
		} else  tw[ii]=thread(dummy);

		if(ii>VALUES_LIM){
			if((ii%3)==0){
				tg[ii]=thread(callrollback,ref(STK),ref(M1),ref(M2));
			}
			else if((ii%7)==0){
				tg[ii]=thread(callcommit,ref(STK));
			}
			else if((ii%11)==0){
				tg[ii]=thread(callrollback,ref(STK),ref(M1),ref(M2));
			} else {
				tg[ii]=thread(dummy);
			}
		} else {
			tg[ii]=thread(dummy);
		}

	}

	for(int ii=0;ii<BOUND_LIM;ii++){
		tr1[ii].join(); 
		tr2[ii].join();
		tw[ii].join();
		te[ii].join();
		tg[ii].join();
	}

	printf("M1          -> %.5f \n",ax[0]);
	printf("M2          -> %.5f \n",ax[1]);
	printf("STK         -> %.5f \n",ax[2]);
	printf("CACHE READ  -> %.5f \n",ax[3]);
	printf("CACHE WRITE -> %.5f \n",ax[4]);
	return 0;
}