Re: dash bug: double-quoted "\" breaks glob protection for next char

Harald van Dijk <harald@xxxxxxxxxxx> · Thu, 8 Mar 2018 01:40:32 +0100

On 3/7/18 8:04 PM, Harald van Dijk wrote:
On 3/7/18 5:29 PM, Herbert Xu wrote:
On Sun, Mar 04, 2018 at 10:29:25PM +0100, Harald van Dijk wrote:
Another pre-existing dash parsing bug that I encountered now is $(( ( 
$(( 1
)) ) )). This should expand to 1, but gives a hard error in dash, 
again due
to the non-recursive nature of dash's parser. A small generalisation 
of what
I've been doing so far could handle this, so it makes sense to me to 
try to
achieve this at the same time.
Thanks for the patches.  You have convinced that a stacked syntax
is indeed necessary.  However, I'd like to do the reversion of the
run-time quote book-keeping patch in a separate step.

I also didn't quite like the idea of scanning the string backwards
to find the previous syntax.  So here is my attempt at the recursive
parsing using alloca.
If the syntax stack is to be stored on the actual stack, then real 
recursion could be used instead, as attached. I tried to avoid recursion 
for the cases that unpatched dash already handled properly.
It's not completely recursive in that I've
maintained the existing behaviour of double-quotes inside parameter
expansion inside double-quotes.  However, it does seem to address
most of the issues that you have raised.
One thing I found it doesn't handle, although it does look like you try 
to support it, is ${$-"}"}, which should expand to the same thing as $$. 
This doesn't work because the first " is CDQUOTE, not CENDQUOTE, so 
synstack->innerdq doesn't get set, and there's nothing else that 
prevents the quoted } from being treated as the end of the variable 
substitution.
To handle that it can just look at dblquote instead, included in the 
attached.
In this patch, I managed let "${$+\}}" expand to }, but "${$+"\}"}" to 
\}, for the reasons I gave earlier.
Martijn Dekker's test case of 'x=0; x=$((${x}+1))' also works with this.

Cheers,
Harald van Dijk

diff --git a/src/expand.c b/src/expand.c
index 2a50830..903e250 100644
--- a/src/expand.c
+++ b/src/expand.c
@@ -83,7 +83,7 @@
 #define RMESCAPE_HEAP	0x10	/* Malloc strings instead of stalloc */
 
 /* Add CTLESC when necessary. */
-#define QUOTES_ESC	(EXP_FULL | EXP_CASE | EXP_QPAT)
+#define QUOTES_ESC	(EXP_FULL | EXP_CASE)
 /* Do not skip NUL characters. */
 #define QUOTES_KEEPNUL	EXP_TILDE
 
@@ -333,16 +333,6 @@ addquote:
 		case CTLESC:
 			startloc++;
 			length++;
-
-			/*
-			 * Quoted parameter expansion pattern: remove quote
-			 * unless inside inner quotes or we have a literal
-			 * backslash.
-			 */
-			if (((flag | inquotes) & (EXP_QPAT | EXP_QUOTED)) ==
-			    EXP_QPAT && *p != '\\')
-				break;
-
 			goto addquote;
 		case CTLVAR:
 			p = evalvar(p, flag | inquotes);
@@ -651,8 +641,7 @@ subevalvar(char *p, char *str, int strloc, int subtype, int startloc, int varfla
 	char *(*scan)(char *, char *, char *, char *, int , int);
 
 	argstr(p, EXP_TILDE | (subtype != VSASSIGN && subtype != VSQUESTION ?
-			       (flag & (EXP_QUOTED | EXP_QPAT) ?
-			        EXP_QPAT : EXP_CASE) : 0));
+			       EXP_CASE : 0));
 	STPUTC('\0', expdest);
 	argbackq = saveargbackq;
 	startp = stackblock() + startloc;
@@ -1644,7 +1633,6 @@ char *
 _rmescapes(char *str, int flag)
 {
 	char *p, *q, *r;
-	unsigned inquotes;
 	int notescaped;
 	int globbing;
 
@@ -1674,24 +1662,23 @@ _rmescapes(char *str, int flag)
 			q = mempcpy(q, str, len);
 		}
 	}
-	inquotes = 0;
 	globbing = flag & RMESCAPE_GLOB;
 	notescaped = globbing;
 	while (*p) {
 		if (*p == (char)CTLQUOTEMARK) {
-			inquotes = ~inquotes;
 			p++;
 			notescaped = globbing;
 			continue;
 		}
+		if (*p == '\\') {
+			/* naked back slash */
+			notescaped = 0;
+			goto copy;
+		}
 		if (*p == (char)CTLESC) {
 			p++;
 			if (notescaped)
 				*q++ = '\\';
-		} else if (*p == '\\' && !inquotes) {
-			/* naked back slash */
-			notescaped = 0;
-			goto copy;
 		}
 		notescaped = globbing;
 copy:
diff --git a/src/expand.h b/src/expand.h
index 26dc5b4..90f5328 100644
--- a/src/expand.h
+++ b/src/expand.h
@@ -55,7 +55,6 @@ struct arglist {
 #define	EXP_VARTILDE	0x4	/* expand tildes in an assignment */
 #define	EXP_REDIR	0x8	/* file glob for a redirection (1 match only) */
 #define EXP_CASE	0x10	/* keeps quotes around for CASE pattern */
-#define EXP_QPAT	0x20	/* pattern in quoted parameter expansion */
 #define EXP_VARTILDE2	0x40	/* expand tildes after colons only */
 #define EXP_WORD	0x80	/* expand word in parameter expansion */
 #define EXP_QUOTED	0x100	/* expand word in double quotes */
diff --git a/src/parser.c b/src/parser.c
index 382658e..17d60b0 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -827,7 +827,8 @@ xxreadtoken(void)
 		}
 	}
 breakloop:
-	return readtoken1(c, BASESYNTAX, (char *)NULL, 0);
+	readtoken1(c, BASESYNTAX, (char *)NULL, 0);
+	return lasttoken;
 #undef RETURN
 }
 
@@ -855,47 +856,147 @@ static int pgetc_eatbnl(void)
  * word which marks the end of the document and striptabs is true if
  * leading tabs should be stripped from the document.  The argument firstc
  * is the first character of the input token or document.
- *
+ */
+
+STATIC char *readtoken1_loop(char *out, int c, char const *syntax, char *eofmark, int striptabs, int type);
+
+STATIC int
+readtoken1(int firstc, char const *syntax, char *eofmark, int striptabs)
+{
+	char *out;
+	size_t len;
+	int c;
+	int fd;
+	union node *np;
+
+	quoteflag = 0;
+	backquotelist = NULL;
+	STARTSTACKSTR(out);
+	out = readtoken1_loop(out, firstc, syntax, eofmark, striptabs, 0);
+	USTPUTC('\0', out);
+	len = out - (char *)stackblock();
+	out = stackblock();
+	if (eofmark == NULL) {
+		c = pgetc();
+		if ((c == '>' || c == '<')
+		 && !quoteflag
+		 && len <= 2
+		 && (*out == '\0' || is_digit(*out))) {
+			goto parseredir;
+		} else {
+			pungetc();
+		}
+	}
+	grabstackblock(len);
+	wordtext = out;
+	return lasttoken = TWORD;
+
+/*
+ * Parse a redirection operator.  The variable "out" points to a string
+ * specifying the fd to be redirected.  The variable "c" contains the
+ * first character of the redirection operator.
+ */
+
+parseredir:
+	fd = *out;
+	np = (union node *)stalloc(sizeof (struct nfile));
+	if (c == '>') {
+		np->nfile.fd = 1;
+		c = pgetc();
+		if (c == '>')
+			np->type = NAPPEND;
+		else if (c == '|')
+			np->type = NCLOBBER;
+		else if (c == '&')
+			np->type = NTOFD;
+		else {
+			np->type = NTO;
+			pungetc();
+		}
+	} else {	/* c == '<' */
+		np->nfile.fd = 0;
+		switch (c = pgetc()) {
+		case '<':
+			if (sizeof (struct nfile) != sizeof (struct nhere)) {
+				np = (union node *)stalloc(sizeof (struct nhere));
+				np->nfile.fd = 0;
+			}
+			np->type = NHERE;
+			heredoc = (struct heredoc *)stalloc(sizeof (struct heredoc));
+			heredoc->here = np;
+			if ((c = pgetc()) == '-') {
+				heredoc->striptabs = 1;
+			} else {
+				heredoc->striptabs = 0;
+				pungetc();
+			}
+			break;
+
+		case '&':
+			np->type = NFROMFD;
+			break;
+
+		case '>':
+			np->type = NFROMTO;
+			break;
+
+		default:
+			np->type = NFROM;
+			pungetc();
+			break;
+		}
+	}
+	if (fd != '\0')
+		np->nfile.fd = digit_val(fd);
+	redirnode = np;
+	return lasttoken = TREDIR;
+}
+
+/*
+ * Main function. type determines what to do.
+ * 0: This is the outermost invocation. It reads a single word. Much is done
+ *    without recursion, but there are a few exceptions:
+ *    - Unquoted text inside quoted text, in ${x#y} variable substitutions.
+ *    - Arithmetic expressions inside variable substitutions.
+ *    - Arithmetic expressions inside parentheses inside arithmetic expressions.
+ *    If these are encountered, the function recurses.
+ * 1: Inside a variable substitution. ${x# has already been read. Continue reading until the closing }.
+ * 2: Inside an arithmetic substitution. $(( has already been read. Continue reading until the closing )).
+
  * Because C does not have internal subroutines, I have simulated them
  * using goto's to implement the subroutine linkage.  The following macros
  * will run code that appears at the end of readtoken1.
  */
 
 #define CHECKEND()	{goto checkend; checkend_return:;}
-#define PARSEREDIR()	{goto parseredir; parseredir_return:;}
 #define PARSESUB()	{goto parsesub; parsesub_return:;}
 #define PARSEBACKQOLD()	{oldstyle = 1; goto parsebackq; parsebackq_oldreturn:;}
 #define PARSEBACKQNEW()	{oldstyle = 0; goto parsebackq; parsebackq_newreturn:;}
 #define	PARSEARITH()	{goto parsearith; parsearith_return:;}
 
-STATIC int
-readtoken1(int firstc, char const *syntax, char *eofmark, int striptabs)
+STATIC char *
+readtoken1_loop(char *out, int c, char const *syntax, char *eofmark, int striptabs, int type)
 {
-	int c = firstc;
-	char *out;
-	size_t len;
-	struct nodelist *bqlist;
-	int quotef;
 	int dblquote;
 	int varnest;	/* levels of variables expansion */
 	int arinest;	/* levels of arithmetic expansion */
 	int parenlevel;	/* levels of parens in arithmetic */
 	int dqvarnest;	/* levels of variables expansion within double quotes */
+	int innerdq;	/* double quotes within variable expansion within double quotes */
 	int oldstyle;
-	/* syntax before arithmetic */
-	char const *uninitialized_var(prevsyntax);
 
 	dblquote = 0;
-	if (syntax == DQSYNTAX)
-		dblquote = 1;
-	quotef = 0;
-	bqlist = NULL;
-	varnest = 0;
-	arinest = 0;
+	varnest = type == 1;
+	arinest = type == 2;
 	parenlevel = 0;
 	dqvarnest = 0;
+	innerdq = 0;
+
+	if (syntax == DQSYNTAX) {
+		dblquote = 1;
+		dqvarnest = varnest;
+	}
 
-	STARTSTACKSTR(out);
 	loop: {	/* for each line, until end of word */
 #if ATTY
 		if (c == '\034' && doprompt
@@ -941,20 +1042,25 @@ readtoken1(int firstc, char const *syntax, char *eofmark, int striptabs)
 						c != '\\' && c != '`' &&
 						c != '$' && (
 							c != '"' ||
-							eofmark != NULL
+							(eofmark != NULL && !varnest)
+						) && (
+							c != '}' ||
+							!varnest ||
+							(dqvarnest ? innerdq : dblquote)
 						)
 					) {
+						USTPUTC(CTLESC, out);
 						USTPUTC('\\', out);
 					}
 					USTPUTC(CTLESC, out);
 					USTPUTC(c, out);
-					quotef++;
+					quoteflag++;
 				}
 				break;
 			case CSQUOTE:
 				syntax = SQSYNTAX;
 quotemark:
-				if (eofmark == NULL) {
+				if (eofmark == NULL || varnest) {
 					USTPUTC(CTLQUOTEMARK, out);
 				}
 				break;
@@ -965,12 +1071,12 @@ quotemark:
 			case CENDQUOTE:
 				if (eofmark && !varnest)
 					USTPUTC(c, out);
+				else if (dqvarnest)
+					innerdq ^= 1;
 				else {
-					if (dqvarnest == 0) {
-						syntax = BASESYNTAX;
-						dblquote = 0;
-					}
-					quotef++;
+					syntax = BASESYNTAX;
+					dblquote = 0;
+					quoteflag++;
 					goto quotemark;
 				}
 				break;
@@ -978,12 +1084,15 @@ quotemark:
 				PARSESUB();		/* parse substitution */
 				break;
 			case CENDVAR:	/* '}' */
-				if (varnest > 0) {
-					varnest--;
+				if (varnest > 0 && (dqvarnest ? !innerdq : !dblquote)) {
+					USTPUTC(CTLENDVAR, out);
+					if (!--varnest) {
+						if (type == 1)
+							return out;
+					}
 					if (dqvarnest > 0) {
 						dqvarnest--;
 					}
-					USTPUTC(CTLENDVAR, out);
 				} else {
 					USTPUTC(c, out);
 				}
@@ -999,8 +1108,11 @@ quotemark:
 				} else {
 					if (pgetc() == ')') {
 						USTPUTC(CTLENDARI, out);
-						if (!--arinest)
-							syntax = prevsyntax;
+						if (!--arinest) {
+							if (type == 2)
+								return out;
+							syntax = dblquote ? DQSYNTAX : BASESYNTAX;
+						}
 					} else {
 						/*
 						 * unbalanced parens
@@ -1029,33 +1141,18 @@ quotemark:
 		}
 	}
 endword:
-	if (syntax == ARISYNTAX)
-		synerror("Missing '))'");
-	if (syntax != BASESYNTAX && eofmark == NULL)
-		synerror("Unterminated quoted string");
-	if (varnest != 0) {
-		/* { */
-		synerror("Missing '}'");
-	}
-	USTPUTC('\0', out);
-	len = out - (char *)stackblock();
-	out = stackblock();
-	if (eofmark == NULL) {
-		if ((c == '>' || c == '<')
-		 && quotef == 0
-		 && len <= 2
-		 && (*out == '\0' || is_digit(*out))) {
-			PARSEREDIR();
-			return lasttoken = TREDIR;
-		} else {
-			pungetc();
+	if (!type) {
+		if (syntax == ARISYNTAX)
+			synerror("Missing '))'");
+		if (syntax != BASESYNTAX && eofmark == NULL)
+			synerror("Unterminated quoted string");
+		if (varnest != 0) {
+			/* { */
+			synerror("Missing '}'");
 		}
 	}
-	quoteflag = quotef;
-	backquotelist = bqlist;
-	grabstackblock(len);
-	wordtext = out;
-	return lasttoken = TWORD;
+	pungetc();
+	return out;
 /* end of readtoken routine */
 
 
@@ -1119,70 +1216,6 @@ more_heredoc:
 }
 
 
-/*
- * Parse a redirection operator.  The variable "out" points to a string
- * specifying the fd to be redirected.  The variable "c" contains the
- * first character of the redirection operator.
- */
-
-parseredir: {
-	char fd = *out;
-	union node *np;
-
-	np = (union node *)stalloc(sizeof (struct nfile));
-	if (c == '>') {
-		np->nfile.fd = 1;
-		c = pgetc();
-		if (c == '>')
-			np->type = NAPPEND;
-		else if (c == '|')
-			np->type = NCLOBBER;
-		else if (c == '&')
-			np->type = NTOFD;
-		else {
-			np->type = NTO;
-			pungetc();
-		}
-	} else {	/* c == '<' */
-		np->nfile.fd = 0;
-		switch (c = pgetc()) {
-		case '<':
-			if (sizeof (struct nfile) != sizeof (struct nhere)) {
-				np = (union node *)stalloc(sizeof (struct nhere));
-				np->nfile.fd = 0;
-			}
-			np->type = NHERE;
-			heredoc = (struct heredoc *)stalloc(sizeof (struct heredoc));
-			heredoc->here = np;
-			if ((c = pgetc()) == '-') {
-				heredoc->striptabs = 1;
-			} else {
-				heredoc->striptabs = 0;
-				pungetc();
-			}
-			break;
-
-		case '&':
-			np->type = NFROMFD;
-			break;
-
-		case '>':
-			np->type = NFROMTO;
-			break;
-
-		default:
-			np->type = NFROM;
-			pungetc();
-			break;
-		}
-	}
-	if (fd != '\0')
-		np->nfile.fd = digit_val(fd);
-	redirnode = np;
-	goto parseredir_return;
-}
-
-
 /*
  * Parse a substitution.  At this point, we have read the dollar sign
  * and nothing else.
@@ -1210,6 +1243,8 @@ parsesub: {
 			PARSEBACKQNEW();
 		}
 	} else {
+		const char *newsyntax = syntax == BASESYNTAX ? BASESYNTAX : DQSYNTAX;
+
 		USTPUTC(CTLVAR, out);
 		typeloc = out - (char *)stackblock();
 		STADJUST(1, out);
@@ -1282,6 +1317,7 @@ varname:
 						subtype++;
 					else
 						pungetc();
+					newsyntax = BASESYNTAX;
 					break;
 				}
 			}
@@ -1290,12 +1326,15 @@ badsub:
 			pungetc();
 		}
 		*((char *)stackblock() + typeloc) = subtype;
+		STPUTC('=', out);
 		if (subtype != VSNORMAL) {
-			varnest++;
-			if (dblquote)
-				dqvarnest++;
+			if (newsyntax == syntax) {
+				varnest++;
+				if (dblquote)
+					dqvarnest++;
+			} else
+				out = readtoken1_loop(out, pgetc(), newsyntax, eofmark, striptabs, 1);
 		}
-		STPUTC('=', out);
 	}
 	goto parsesub_return;
 }
@@ -1380,7 +1419,7 @@ done:
 			setinputstring(pstr);
                 }
         }
-	nlpp = &bqlist;
+	nlpp = &backquotelist;
 	while (*nlpp)
 		nlpp = &(*nlpp)->next;
 	*nlpp = (struct nodelist *)stalloc(sizeof (struct nodelist));
@@ -1391,7 +1430,9 @@ done:
 		doprompt = 0;
 	}
 
+	struct nodelist *savebqlist = backquotelist;
 	n = list(2);
+	backquotelist = savebqlist;
 
 	if (oldstyle)
 		doprompt = saveprompt;
@@ -1427,12 +1468,13 @@ done:
  * Parse an arithmetic expansion (indicate start of one and set state)
  */
 parsearith: {
-
-	if (++arinest == 1) {
-		prevsyntax = syntax;
+	USTPUTC(CTLARI, out);
+	if (varnest || parenlevel) {
+		out = readtoken1_loop(out, pgetc(), ARISYNTAX, eofmark, striptabs, 2);
+	} else {
 		syntax = ARISYNTAX;
+		++arinest;
 	}
-	USTPUTC(CTLARI, out);
 	goto parsearith_return;
 }