Hello. At Wed, 2 Oct 2019 19:24:02 -0400, Stephen Frost <sfrost@xxxxxxxxxxx> wrote in > Greetings, > > * Tom Lane (tgl@xxxxxxxxxxxxx) wrote: > > Stephen Frost <sfrost@xxxxxxxxxxx> writes: > > > * Aleš Zelený (zeleny.ales@xxxxxxxxx) wrote: > > >> But recovery on replica failed to proceed WAL file > > >> 0000000100000FED00000039 with log message: " invalid contrecord length > > >> 1956 at FED/38FFE208". > > > > > Err- you've drawn the wrong conclusion from that message (and you're > > > certainly not alone- it's a terrible message and we should really have a > > > HINT there or something). > > > > Yeah, those messages are all pretty ancient, from when WAL was new and not > > to be trusted much. Perhaps the thing to do is move the existing info > > into DETAIL and make the primary message be something like "reached > > apparent end of WAL stream". > > Yes, +1 on that. The attached is something like that. AFAICS we can assume that an invalid record means the end of WAL as long as we are fetching successive records and XLogPageRead() has not logged anything. As the current comment in ReadRecord says, we don't have a message if standby has been triggered, but that's not always the case. Still we may have a message if new segment is available after triggering. I used a kind-of-tricky way to handle optional errdetail but it may be better to have some additional feature as an ereport subfunction. (Maybe named "errif" or something?) I think it works fine as expected but I find one arguable behavior. In the first patch, the "end of WAL" message is accompanied by source, LSN and timeline. LOG: reached end of WAL in archive at 0/A3000060 on timeline 8 DETAIL: invalid record length at 0/A3000060: wanted 24, got 0 The last two just seems good but the first one shows "archive" even on promotion. It is right that we scan archive after promotion but seems confusing. So I attached another patch that shows operation instead of WAL source. LOG: reached end of WAL during streaming reaplication at 0/A5000060 on timeline 8 DETAIL: invalid record length at 0/A5000060: wanted 24, got 0 What do you think about this? regards. -- Kyotaro Horiguchi NTT Open Source Software Center
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 2bd3d0e5e5..70fd34659a 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -852,6 +852,9 @@ static bool bgwriterLaunched = false; static int MyLockNo = 0; static bool holdingAllLocks = false; +/* Have we complaind about the record at the location? */ +static XLogRecPtr lastComplaint = 0; + #ifdef WAL_DEBUG static MemoryContext walDebugCxt = NULL; #endif @@ -889,6 +892,7 @@ static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, bool fetching_ckpt, XLogRecPtr tliRecPtr); static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr); +static bool have_complained_at(XLogRecPtr RecPtr); static void XLogFileClose(void); static void PreallocXlogFiles(XLogRecPtr endptr); static void RemoveTempXlogFiles(void); @@ -4264,8 +4268,11 @@ ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, int emode, record = XLogReadRecord(xlogreader, RecPtr, &errormsg); ReadRecPtr = xlogreader->ReadRecPtr; EndRecPtr = xlogreader->EndRecPtr; + if (record == NULL) { + XLogRecPtr reportptr = RecPtr ? RecPtr : EndRecPtr; + if (readFile >= 0) { close(readFile); @@ -4273,13 +4280,24 @@ ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, int emode, } /* - * We only end up here without a message when XLogPageRead() - * failed - in that case we already logged something. + * When we end up here while reading successive records, we reached + * the end of WAL unless XLogPageRead() has logged something. We + * may or may not have a message here depending on the situation. + * Otherwise we just report the received message if any and if + * needed. + * + * Note: errormsg is alreay translated. */ - if (errormsg) - ereport(emode_for_corrupt_record(emode, - RecPtr ? RecPtr : EndRecPtr), - (errmsg_internal("%s", errormsg) /* already translated */ )); + if (RecPtr == InvalidXLogRecPtr && !have_complained_at(EndRecPtr)) + ereport(emode, + (errmsg("reached end of WAL in %s at %X/%X on timeline %u", + xlogSourceNames[currentSource], + (uint32) (reportptr >> 32), (uint32) reportptr, + ThisTimeLineID), + (errormsg ? errdetail_internal("%s", errormsg) : 0))); + else if (errormsg) + ereport(emode_for_corrupt_record(emode, reportptr), + (errmsg_internal("%s", errormsg))); } /* @@ -4331,7 +4349,7 @@ ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, int emode, !fetching_ckpt) { ereport(DEBUG1, - (errmsg_internal("reached end of WAL in pg_wal, entering archive recovery"))); + (errmsg_internal("entering archive recovery"))); InArchiveRecovery = true; if (StandbyModeRequested) StandbyMode = true; @@ -12160,8 +12178,6 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr) { - static XLogRecPtr lastComplaint = 0; - if (readSource == XLOG_FROM_PG_WAL && emode == LOG) { if (RecPtr == lastComplaint) @@ -12172,6 +12188,13 @@ emode_for_corrupt_record(int emode, XLogRecPtr RecPtr) return emode; } +/* Have we complained about the record at the location? */ +static bool +have_complained_at(XLogRecPtr RecPtr) +{ + return lastComplaint == RecPtr; +} + /* * Check to see whether the user-specified trigger file exists and whether a * promote request has arrived. If either condition holds, return true.
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 2bd3d0e5e5..5f1f7e5d7e 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -852,6 +852,9 @@ static bool bgwriterLaunched = false; static int MyLockNo = 0; static bool holdingAllLocks = false; +/* Have we complaind about the record at the location? */ +static XLogRecPtr lastComplaint = 0; + #ifdef WAL_DEBUG static MemoryContext walDebugCxt = NULL; #endif @@ -889,6 +892,7 @@ static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, bool fetching_ckpt, XLogRecPtr tliRecPtr); static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr); +static bool have_complained_at(XLogRecPtr RecPtr); static void XLogFileClose(void); static void PreallocXlogFiles(XLogRecPtr endptr); static void RemoveTempXlogFiles(void); @@ -4266,6 +4270,8 @@ ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, int emode, EndRecPtr = xlogreader->EndRecPtr; if (record == NULL) { + XLogRecPtr reportptr = RecPtr ? RecPtr : EndRecPtr; + if (readFile >= 0) { close(readFile); @@ -4273,13 +4279,33 @@ ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, int emode, } /* - * We only end up here without a message when XLogPageRead() - * failed - in that case we already logged something. + * When we end up here while reading successive records, we reached + * the end of WAL unless XLogPageRead() has logged something. We + * may or may not have a message here depending on the situation. + * Otherwise we just report the received message if any and if + * needed. + * + * Note: errormsg is alreay translated. */ - if (errormsg) - ereport(emode_for_corrupt_record(emode, - RecPtr ? RecPtr : EndRecPtr), - (errmsg_internal("%s", errormsg) /* already translated */ )); + if (RecPtr == InvalidXLogRecPtr && !have_complained_at(EndRecPtr)) + { + char *operation = "crash recovery"; + + if (StandbyMode) + operation = "streaming reaplication"; + else if (InArchiveRecovery) + operation = "archive recovery"; + + ereport(emode, + (errmsg("reached end of WAL during %s at %X/%X on timeline %u", + operation, + (uint32) (reportptr >> 32), (uint32) reportptr, + ThisTimeLineID), + (errormsg ? errdetail_internal("%s", errormsg) : 0))); + } + else if (errormsg) + ereport(emode_for_corrupt_record(emode, reportptr), + (errmsg_internal("%s", errormsg))); } /* @@ -4331,7 +4357,7 @@ ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, int emode, !fetching_ckpt) { ereport(DEBUG1, - (errmsg_internal("reached end of WAL in pg_wal, entering archive recovery"))); + (errmsg_internal("entering archive recovery"))); InArchiveRecovery = true; if (StandbyModeRequested) StandbyMode = true; @@ -12160,8 +12186,6 @@ WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr) { - static XLogRecPtr lastComplaint = 0; - if (readSource == XLOG_FROM_PG_WAL && emode == LOG) { if (RecPtr == lastComplaint) @@ -12172,6 +12196,13 @@ emode_for_corrupt_record(int emode, XLogRecPtr RecPtr) return emode; } +/* Have we complained about the record at the location? */ +static bool +have_complained_at(XLogRecPtr RecPtr) +{ + return lastComplaint == RecPtr; +} + /* * Check to see whether the user-specified trigger file exists and whether a * promote request has arrived. If either condition holds, return true.