Tom Lane wrote:
Hmm, the problem would be trying to figure out what percentage of writes
could be blamed solely on hint-bit updates and not any other change to
the page. I don't think that the bufmgr currently keeps enough state to
know that, but you could probably modify it easily enough, since callers
distinguish MarkBufferDirty from SetBufferCommitInfoNeedsSave. Define
another flag bit that's set only by the first, and test it during
write-out.
Ok, I made a few changes to bufmgr per my understanding of your
description above and with my limited understanding of the code. Patch
is attached.
Assuming the patch is correct, the effect of writes due to hint bits is
quite significant. I collected the data below by runing pgbench in one
terminal and psql on another to run the query.
Is the data plausible?
-Robert
--------------
Backend PID : 16189
SQL Statement : select count(*) from accounts;
Execution time : 17.33 sec
============ Buffer Read Counts ============
Tablespace Database Table Count
1663 16384 2600 1
1663 16384 2601 1
1663 16384 2615 1
1663 16384 1255 2
1663 16384 2602 2
1663 16384 2603 2
1663 16384 2616 2
1663 16384 2650 2
1663 16384 2678 2
1663 16384 1247 3
1663 16384 1249 3
1663 16384 2610 3
1663 16384 2655 3
1663 16384 2679 3
1663 16384 2684 3
1663 16384 2687 3
1663 16384 2690 3
1663 16384 2691 3
1663 16384 2703 4
1663 16384 1259 5
1663 16384 2653 5
1663 16384 2662 5
1663 16384 2663 5
1663 16384 2659 7
1663 16384 16397 8390
======== Dirty Buffer Write Counts =========
Tablespace Database Table Count
1663 16384 16402 2
1663 16384 16394 11
1663 16384 16397 4771
========== Hint Bits Write Counts ==========
Tablespace Database Table Count
1663 16384 16397 4508
Total buffer cache hits : 732
Total buffer cache misses : 7731
Average read time from cache : 9136 (ns)
Average read time from disk : 384201 (ns)
Average write time to disk : 210709 (ns)
Backend PID : 16189
SQL Statement : select count(*) from accounts;
Execution time : 12.72 sec
============ Buffer Read Counts ============
Tablespace Database Table Count
1663 16384 16397 8392
======== Dirty Buffer Write Counts =========
Tablespace Database Table Count
1663 16384 16394 6
1663 16384 16402 7
1663 16384 16397 2870
========== Hint Bits Write Counts ==========
Tablespace Database Table Count
1663 16384 16402 2
1663 16384 16397 2010
Total buffer cache hits : 606
Total buffer cache misses : 7786
Average read time from cache : 6949 (ns)
Average read time from disk : 706288 (ns)
Average write time to disk : 90426 (ns)
Index: bufmgr.c
===================================================================
RCS file: /projects/cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v
retrieving revision 1.228
diff -u -3 -p -r1.228 bufmgr.c
--- bufmgr.c 1 Jan 2008 19:45:51 -0000 1.228
+++ bufmgr.c 15 May 2008 20:56:38 -0000
@@ -42,6 +42,7 @@
#include "storage/smgr.h"
#include "utils/resowner.h"
#include "pgstat.h"
+#include "pg_trace.h"
/* Note: these two macros only work on shared buffers, not local ones! */
@@ -171,6 +172,7 @@ ReadBuffer_common(Relation reln, BlockNu
if (isExtend)
blockNum = smgrnblocks(reln->rd_smgr);
+ TRACE_POSTGRESQL_BUFFER_READ_START(blockNum, reln->rd_node.spcNode, reln->rd_node.dbNode, reln->rd_node.relNode, isLocalBuf);
pgstat_count_buffer_read(reln);
if (isLocalBuf)
@@ -200,12 +202,16 @@ ReadBuffer_common(Relation reln, BlockNu
{
if (!isExtend)
{
+ TRACE_POSTGRESQL_BUFFER_HIT();
/* Just need to update stats before we exit */
pgstat_count_buffer_hit(reln);
if (VacuumCostActive)
VacuumCostBalance += VacuumCostPageHit;
+ TRACE_POSTGRESQL_BUFFER_READ_DONE(blockNum,
+ reln->rd_node.spcNode, reln->rd_node.dbNode,
+ reln->rd_node.relNode, isLocalBuf, found);
return BufferDescriptorGetBuffer(bufHdr);
}
@@ -257,6 +263,7 @@ ReadBuffer_common(Relation reln, BlockNu
} while (!StartBufferIO(bufHdr, true));
}
}
+ TRACE_POSTGRESQL_BUFFER_MISS();
/*
* if we have gotten to this point, we have allocated a buffer for the
@@ -324,6 +331,9 @@ ReadBuffer_common(Relation reln, BlockNu
if (VacuumCostActive)
VacuumCostBalance += VacuumCostPageMiss;
+ TRACE_POSTGRESQL_BUFFER_READ_DONE(blockNum, reln->rd_node.spcNode,
+ reln->rd_node.dbNode, reln->rd_node.relNode, isLocalBuf, found);
+
return BufferDescriptorGetBuffer(bufHdr);
}
@@ -466,6 +476,11 @@ BufferAlloc(Relation reln,
* happens to be trying to split the page the first one got from
* StrategyGetBuffer.)
*/
+
+ TRACE_POSTGRESQL_DIRTY_BUFFER_WRITE_START(blockNum,
+ reln->rd_node.spcNode, reln->rd_node.dbNode,
+ reln->rd_node.relNode);
+
if (LWLockConditionalAcquire(buf->content_lock, LW_SHARED))
{
/*
@@ -488,6 +503,11 @@ BufferAlloc(Relation reln,
/* OK, do the I/O */
FlushBuffer(buf, NULL);
LWLockRelease(buf->content_lock);
+
+ TRACE_POSTGRESQL_DIRTY_BUFFER_WRITE_DONE(
+ blockNum, reln->rd_node.spcNode,
+ reln->rd_node.dbNode, reln->rd_node.relNode,
+ (oldFlags & BM_HINT_BITS_TEST));
}
else
{
@@ -1734,6 +1754,10 @@ FlushBuffer(volatile BufferDesc *buf, SM
buf->flags &= ~BM_JUST_DIRTIED;
UnlockBufHdr(buf);
+ TRACE_POSTGRESQL_BUFFER_WRITE_START(buf->tag.blockNum,
+ reln->smgr_rnode.spcNode, reln->smgr_rnode.dbNode,
+ reln->smgr_rnode.relNode);
+
smgrwrite(reln,
buf->tag.blockNum,
(char *) BufHdrGetBlock(buf),
@@ -1741,6 +1765,10 @@ FlushBuffer(volatile BufferDesc *buf, SM
BufferFlushCount++;
+ TRACE_POSTGRESQL_BUFFER_WRITE_DONE(buf->tag.blockNum,
+ reln->smgr_rnode.spcNode, reln->smgr_rnode.dbNode,
+ reln->smgr_rnode.relNode, (buf->flags & BM_HINT_BITS_TEST));
+
/*
* Mark the buffer as clean (unless BM_JUST_DIRTIED has become set) and
* end the io_in_progress state.
@@ -2155,7 +2183,7 @@ SetBufferCommitInfoNeedsSave(Buffer buff
Assert(bufHdr->refcount > 0);
if (!(bufHdr->flags & BM_DIRTY) && VacuumCostActive)
VacuumCostBalance += VacuumCostPageDirty;
- bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED);
+ bufHdr->flags |= (BM_DIRTY | BM_JUST_DIRTIED | BM_HINT_BITS_TEST);
UnlockBufHdr(bufHdr);
}
}
@@ -2492,7 +2520,7 @@ TerminateBufferIO(volatile BufferDesc *b
Assert(buf->flags & BM_IO_IN_PROGRESS);
buf->flags &= ~(BM_IO_IN_PROGRESS | BM_IO_ERROR);
if (clear_dirty && !(buf->flags & BM_JUST_DIRTIED))
- buf->flags &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED);
+ buf->flags &= ~(BM_DIRTY | BM_CHECKPOINT_NEEDED | BM_HINT_BITS_TEST);
buf->flags |= set_flag_bits;
UnlockBufHdr(buf);
Index: buf_internals.h
===================================================================
RCS file: /projects/cvsroot/pgsql/src/include/storage/buf_internals.h,v
retrieving revision 1.95
diff -u -3 -p -r1.95 buf_internals.h
--- buf_internals.h 1 Jan 2008 19:45:58 -0000 1.95
+++ buf_internals.h 15 May 2008 20:59:42 -0000
@@ -36,6 +36,7 @@
#define BM_JUST_DIRTIED (1 << 5) /* dirtied since write started */
#define BM_PIN_COUNT_WAITER (1 << 6) /* have waiter for sole pin */
#define BM_CHECKPOINT_NEEDED (1 << 7) /* must write for checkpoint */
+#define BM_HINT_BITS_TEST (1 << 8) /* test effect of writes due to hint bits */
typedef bits16 BufFlags;