Re: I/O on select count(*)

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Tom Lane wrote:
Hmm, the problem would be trying to figure out what percentage of writes
could be blamed solely on hint-bit updates and not any other change to
the page.  I don't think that the bufmgr currently keeps enough state to
know that, but you could probably modify it easily enough, since callers
distinguish MarkBufferDirty from SetBufferCommitInfoNeedsSave.  Define
another flag bit that's set only by the first, and test it during
write-out.
Ok, I made a few changes to bufmgr per my understanding of your description above and with my limited understanding of the code. Patch is attached.

Assuming the patch is correct, the effect of writes due to hint bits is quite significant. I collected the data below by runing pgbench in one terminal and psql on another to run the query.

Is the data plausible?

-Robert

--------------


Backend PID    : 16189
SQL Statement  : select count(*) from accounts;
Execution time : 17.33 sec

============ Buffer Read Counts ============
Tablespace   Database      Table      Count
     1663      16384       2600          1
     1663      16384       2601          1
     1663      16384       2615          1
     1663      16384       1255          2
     1663      16384       2602          2
     1663      16384       2603          2
     1663      16384       2616          2
     1663      16384       2650          2
     1663      16384       2678          2
     1663      16384       1247          3
     1663      16384       1249          3
     1663      16384       2610          3
     1663      16384       2655          3
     1663      16384       2679          3
     1663      16384       2684          3
     1663      16384       2687          3
     1663      16384       2690          3
     1663      16384       2691          3
     1663      16384       2703          4
     1663      16384       1259          5
     1663      16384       2653          5
     1663      16384       2662          5
     1663      16384       2663          5
     1663      16384       2659          7
     1663      16384      16397       8390

======== Dirty Buffer Write Counts =========
Tablespace   Database      Table      Count
     1663      16384      16402          2
     1663      16384      16394         11
     1663      16384      16397       4771

========== Hint Bits Write Counts ==========
Tablespace   Database      Table      Count
     1663      16384      16397       4508

Total buffer cache hits      :       732
Total buffer cache misses    :      7731
Average read time from cache :      9136 (ns)
Average read time from disk  :    384201 (ns)
Average write time to disk   :    210709 (ns)


Backend PID    : 16189
SQL Statement  : select count(*) from accounts;
Execution time : 12.72 sec

============ Buffer Read Counts ============
Tablespace   Database      Table      Count
     1663      16384      16397       8392

======== Dirty Buffer Write Counts =========
Tablespace   Database      Table      Count
     1663      16384      16394          6
     1663      16384      16402          7
     1663      16384      16397       2870

========== Hint Bits Write Counts ==========
Tablespace   Database      Table      Count
     1663      16384      16402          2
     1663      16384      16397       2010

Total buffer cache hits      :       606
Total buffer cache misses    :      7786
Average read time from cache :      6949 (ns)
Average read time from disk  :    706288 (ns)
Average write time to disk   :     90426 (ns)

Index: bufmgr.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v
retrieving revision 1.228
diff -u -3 -p -r1.228 bufmgr.c
--- bufmgr.c	1 Jan 2008 19:45:51 -0000	1.228
+++ bufmgr.c	15 May 2008 20:56:38 -0000
@@ -42,6 +42,7 @@
 #include "storage/smgr.h"
 #include "utils/resowner.h"
 #include "pgstat.h"
+#include "pg_trace.h"
 
 
 /* Note: these two macros only work on shared buffers, not local ones! */
@@ -171,6 +172,7 @@ ReadBuffer_common(Relation reln, BlockNu
 	if (isExtend)
 		blockNum = smgrnblocks(reln->rd_smgr);
 
+	TRACE_POSTGRESQL_BUFFER_READ_START(blockNum, reln->rd_node.spcNode, reln->rd_node.dbNode, reln->rd_node.relNode, isLocalBuf);
 	pgstat_count_buffer_read(reln);
 
 	if (isLocalBuf)
@@ -200,12 +202,16 @@ ReadBuffer_common(Relation reln, BlockNu
 	{
 		if (!isExtend)
 		{
+			TRACE_POSTGRESQL_BUFFER_HIT();
 			/* Just need to update stats before we exit */
 			pgstat_count_buffer_hit(reln);
 
 			if (VacuumCostActive)
 				VacuumCostBalance += VacuumCostPageHit;
 
+			TRACE_POSTGRESQL_BUFFER_READ_DONE(blockNum,
+				 reln->rd_node.spcNode, reln->rd_node.dbNode,
+				 reln->rd_node.relNode, isLocalBuf, found);
 			return BufferDescriptorGetBuffer(bufHdr);
 		}
 
@@ -257,6 +263,7 @@ ReadBuffer_common(Relation reln, BlockNu
 			} while (!StartBufferIO(bufHdr, true));
 		}
 	}
+	TRACE_POSTGRESQL_BUFFER_MISS();
 
 	/*
 	 * if we have gotten to this point, we have allocated a buffer for the
@@ -324,6 +331,9 @@ ReadBuffer_common(Relation reln, BlockNu
 	if (VacuumCostActive)
 		VacuumCostBalance += VacuumCostPageMiss;
 
+	TRACE_POSTGRESQL_BUFFER_READ_DONE(blockNum, reln->rd_node.spcNode,
+		reln->rd_node.dbNode, reln->rd_node.relNode, isLocalBuf, found);
+
 	return BufferDescriptorGetBuffer(bufHdr);
 }
 
@@ -466,6 +476,11 @@ BufferAlloc(Relation reln,
 			 * happens to be trying to split the page the first one got from
 			 * StrategyGetBuffer.)
 			 */
+
+			TRACE_POSTGRESQL_DIRTY_BUFFER_WRITE_START(blockNum,
+			  reln->rd_node.spcNode, reln->rd_node.dbNode,
+			  reln->rd_node.relNode);
+
 			if (LWLockConditionalAcquire(buf->content_lock, LW_SHARED))
 			{
 				/*
@@ -488,6 +503,11 @@ BufferAlloc(Relation reln,
 				/* OK, do the I/O */
 				FlushBuffer(buf, NULL);
 				LWLockRelease(buf->content_lock);
+
+				TRACE_POSTGRESQL_DIRTY_BUFFER_WRITE_DONE(
+				  blockNum, reln->rd_node.spcNode,
+				  reln->rd_node.dbNode, reln->rd_node.relNode,
+				  (oldFlags & BM_HINT_BITS_TEST));
 			}
 			else
 			{
@@ -1734,6 +1754,10 @@ FlushBuffer(volatile BufferDesc *buf, SM
 	buf->flags &= ~BM_JUST_DIRTIED;
 	UnlockBufHdr(buf);
 
+	TRACE_POSTGRESQL_BUFFER_WRITE_START(buf->tag.blockNum,
+		 reln->smgr_rnode.spcNode, reln->smgr_rnode.dbNode,
+		 reln->smgr_rnode.relNode);
+
 	smgrwrite(reln,
 			  buf->tag.blockNum,
 			  (char *) BufHdrGetBlock(buf),
@@ -1741,6 +1765,10 @@ FlushBuffer(volatile BufferDesc *buf, SM
 
 	BufferFlushCount++;
 
+	TRACE_POSTGRESQL_BUFFER_WRITE_DONE(buf->tag.blockNum,
+		 reln->smgr_rnode.spcNode, reln->smgr_rnode.dbNode,
+		 reln->smgr_rnode.relNode, (buf->flags & BM_HINT_BITS_TEST));
+
 	/*
 	 * Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
 	 * end the io_in_progress state.
@@ -2155,7 +2183,7 @@ SetBufferCommitInfoNeedsSave(Buffer buff
 		Assert(bufHdr->refcount > 0);
 		if (!(bufHdr->flags & BM_DIRTY) && VacuumCostActive)
 			VacuumCostBalance += VacuumCostPageDirty;
-		bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
+		bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED | BM_HINT_BITS_TEST);
 		UnlockBufHdr(bufHdr);
 	}
 }
@@ -2492,7 +2520,7 @@ TerminateBufferIO(volatile BufferDesc *b
 	Assert(buf->flags & BM_IO_IN_PROGRESS);
 	buf->flags &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
 	if (clear_dirty && !(buf->flags & BM_JUST_DIRTIED))
-		buf->flags &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
+		buf->flags &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED | BM_HINT_BITS_TEST);
 	buf->flags |= set_flag_bits;
 
 	UnlockBufHdr(buf);
Index: buf_internals.h
===================================================================
RCS file: /projects/cvsroot/pgsql/src/include/storage/buf_internals.h,v
retrieving revision 1.95
diff -u -3 -p -r1.95 buf_internals.h
--- buf_internals.h	1 Jan 2008 19:45:58 -0000	1.95
+++ buf_internals.h	15 May 2008 20:59:42 -0000
@@ -36,6 +36,7 @@
 #define BM_JUST_DIRTIED			(1 << 5)		/* dirtied since write started */
 #define BM_PIN_COUNT_WAITER		(1 << 6)		/* have waiter for sole pin */
 #define BM_CHECKPOINT_NEEDED	(1 << 7)		/* must write for checkpoint */
+#define BM_HINT_BITS_TEST	(1 << 8)		/* test effect of writes due to hint bits */
 
 typedef bits16 BufFlags;
 

[Postgresql General]     [Postgresql PHP]     [PHP Users]     [PHP Home]     [PHP on Windows]     [Kernel Newbies]     [PHP Classes]     [PHP Books]     [PHP Databases]     [Yosemite]

  Powered by Linux