On Wed, 31 Aug 2005, Darryl L. Miles wrote:
My config entry (based on the example for the Apache common format):
logformat combined %>a %ui %un [%tl] "%rm %ru HTTP/%rv" %Hs %<st
"%{Referer}>h" "%{User-Agent}>h" %Ss:%Sh
The problem is that my logfile stats program is unable to parse the line.
Looks like someone is trawling for an awstats.pl bug. An example entry is:
WARN:a1cpu4.bz.log:1786006 parse error for length at w;wget"
WARN: 213.61.102.218 - - [15/Aug/2005:22:39:01 +0100] "GET
http://62.XX.XX.109//awstats.pl"w;wget" HTTP/1.1" 404 454 "-"
"Mozilla/4.0(compatible; MSIE 6.0; Windows 98)" TCP_MISS:DIRECT
What I expected to see was the conversion of:
"GET http://62.XX.XX.109//awstats.pl"w;wget" HTTP/1.1"
into the following (with an additional \ character), which is what Apache does:
"GET http://62.XX.XX.109//awstats.pl\"w;wget" HTTP/1.1"
Right, the quoting selection magic currently doesn't handle this case
very well; it defaults to using no quoting of the URL data.
You should get the expected output if you explicitly select the quoted
string output format for the URL field:
logformat combined %>a %ui %un [%tl] "%rm %"ru HTTP/%rv" %Hs %<st "%{Referer}>h" "%{User-Agent}>h" %Ss:%Sh
The attached incremental patch should correct the logformat directive to
automatically use quoted string escaping on any format element found
within a quoted string (not only when the quotes are immediately around the
item, as in the Referer and User-Agent cases), and similarly for bracketed
items. I have also tried to make the description of the format selectors
perhaps a little easier to understand.
Regards
Henrik
Index: src/cf.data.pre
===================================================================
RCS file: /cvsroot/squid/squid/src/cf.data.pre,v
retrieving revision 1.49.2.40.2.16
diff -u -r1.49.2.40.2.16 cf.data.pre
--- src/cf.data.pre 27 May 2005 23:49:29 -0000 1.49.2.40.2.16
+++ src/cf.data.pre 1 Sep 2005 19:23:05 -0000
@@ -847,17 +847,18 @@
The <format specification> is a string with embedded % format codes
% format codes all follow the same basic structure where all but
- the formatcode is optional. Output strings are automatically quoted
+ the formatcode is optional. Output strings are automatically escaped
as required according to their context and the output format
- modifiers are usually unneeded but can be specified if an explicit
- quoting format is desired.
+ modifiers are usually not needed, but can be specified if an explicit
+ output format is desired.
% ["|[|'|#] [-] [[0]width] [{argument}] formatcode
- " quoted string output format
- [ squid log quoted format as used by log_mime_hdrs
- # URL quoted output format
- ' No automatic quoting
+ " output in quoted string format
+ [ output in squid text log format as used by log_mime_hdrs
+ # output in URL quoted format
+ ' output as-is
+
- left aligned
width field width. If starting with 0 then the
output is zero padded
Index: src/access_log.c
===================================================================
RCS file: /cvsroot/squid/squid/src/access_log.c,v
retrieving revision 1.15.6.3.2.13
diff -u -r1.15.6.3.2.13 access_log.c
--- src/access_log.c 27 May 2005 04:34:12 -0000 1.15.6.3.2.13
+++ src/access_log.c 1 Sep 2005 19:23:05 -0000
@@ -718,7 +718,7 @@
* def is for sure null-terminated
*/
static int
-accessLogGetNewLogFormatToken(logformat_token * lt, char *def, char *last)
+accessLogGetNewLogFormatToken(logformat_token * lt, char *def, enum log_quote *quote)
{
char *cur = def;
struct logformat_token_table_entry *lte;
@@ -733,8 +733,26 @@
xstrncpy(cp, cur, l + 1);
lt->type = LFT_STRING;
lt->data.string = cp;
- *last = cur[l - 1];
- cur += l;
+ while (l > 0) {
+ switch(*cur) {
+ case '"':
+ if (*quote == LOG_QUOTE_NONE)
+ *quote = LOG_QUOTE_QUOTES;
+ else if (*quote == LOG_QUOTE_QUOTES)
+ *quote = LOG_QUOTE_NONE;
+ break;
+ case '[':
+ if (*quote == LOG_QUOTE_NONE)
+ *quote = LOG_QUOTE_BRAKETS;
+ break;
+ case ']':
+ if (*quote == LOG_QUOTE_BRAKETS)
+ *quote = LOG_QUOTE_NONE;
+ break;
+ }
+ cur++;
+ l--;
+ }
goto done;
}
if (!*cur)
@@ -757,6 +775,9 @@
lt->quote = LOG_QUOTE_URL;
cur++;
break;
+ default:
+ lt->quote = *quote;
+ break;
}
if (*cur == '-') {
lt->left = 1;
@@ -793,12 +814,6 @@
fatalf("Can't parse configuration token: '%s'\n",
def);
}
- if (!lt->quote) {
- if (*last == '"' && *cur == '"')
- lt->quote = LOG_QUOTE_QUOTES;
- else if (*last == '[' && *cur == ']')
- lt->quote = LOG_QUOTE_BRAKETS;
- }
if (*cur == ' ') {
lt->space = 1;
cur++;
@@ -854,7 +869,7 @@
{
char *cur, *eos;
logformat_token *new_lt, *last_lt;
- char last = '\0';
+ enum log_quote quote = LOG_QUOTE_NONE;
debug(46, 1) ("accessLogParseLogFormat: got definition '%s'\n", def);
@@ -865,12 +880,12 @@
cur = def;
eos = def + strlen(def);
*fmt = new_lt = last_lt = xmalloc(sizeof(logformat_token));
- cur += accessLogGetNewLogFormatToken(new_lt, cur, &last);
+ cur += accessLogGetNewLogFormatToken(new_lt, cur, &quote);
while (cur < eos) {
new_lt = xmalloc(sizeof(logformat_token));
last_lt->next = new_lt;
last_lt = new_lt;
- cur += accessLogGetNewLogFormatToken(new_lt, cur, &last);
+ cur += accessLogGetNewLogFormatToken(new_lt, cur, &quote);
}
return 1;
}