[nasm:master] preproc: when parsing a # marker, use C-style string unquoting

nasm-bot for H. Peter Anvin (Intel) hpa at zytor.com
Mon Jul 13 14:15:05 PDT 2020


Commit-ID:  1d151a8558f1ba7ef971a3b5af960db0031a0383
Gitweb:     http://repo.or.cz/w/nasm.git?a=commitdiff;h=1d151a8558f1ba7ef971a3b5af960db0031a0383
Author:     H. Peter Anvin (Intel) <hpa at zytor.com>
AuthorDate: Mon, 13 Jul 2020 14:10:16 -0700
Committer:  H. Peter Anvin (Intel) <hpa at zytor.com>
CommitDate: Mon, 13 Jul 2020 14:14:28 -0700

preproc: when parsing a # marker, use C-style string unquoting

To handle escape codes in filename strings after # markers correctly,
we need nasm_unquote() to be aware that it is using C escapes;
otherwise things like "foo`bar" will break.

Signed-off-by: H. Peter Anvin (Intel) <hpa at zytor.com>


---
 asm/preproc.c | 71 ++++++++++++++++++++++++-------------------------------
 asm/quote.c   | 75 ++++++++++++++++++++++++++++++++---------------------------
 asm/quote.h   | 15 ++++++++++++
 3 files changed, 87 insertions(+), 74 deletions(-)

diff --git a/asm/preproc.c b/asm/preproc.c
index 6a71ad00..0dfde9a5 100644
--- a/asm/preproc.c
+++ b/asm/preproc.c
@@ -721,30 +721,37 @@ static inline bool tok_isnt(const Token *x, char c)
  * Unquote a token if it is a string, and set its type to
  * TOK_INTERNAL_STRING.
  */
-static const char *unquote_token(Token *t)
+
+/*
+ * Common version for any kind of quoted string; see asm/quote.c for
+ * information about the arguments.
+ */
+static const char *unquote_token_anystr(Token *t, uint32_t badctl, char qstart)
 {
+    size_t nlen, olen;
+    char *p;
+
     if (t->type != TOK_STRING)
 	return tok_text(t);
 
+    olen = t->len;
+    p = (olen > INLINE_TEXT) ? t->text.p.ptr : t->text.a;
+    t->len = nlen = nasm_unquote_anystr(p, NULL, badctl, qstart);
     t->type = TOK_INTERNAL_STRING;
 
-    if (t->len > INLINE_TEXT) {
-	char *p = t->text.p.ptr;
+    if (olen <= INLINE_TEXT || nlen > INLINE_TEXT)
+        return p;
 
-	t->len = nasm_unquote(p, NULL);
+    nasm_zero(t->text.a);
+    memcpy(t->text.a, p, nlen);
+    nasm_free(p);
+    return t->text.a;
+}
 
-	if (t->len <= INLINE_TEXT) {
-	    nasm_zero(t->text.a);
-	    memcpy(t->text.a, p, t->len);
-	    nasm_free(p);
-	    return t->text.a;
-	} else {
-	    return p;
-	}
-    } else {
-	t->len = nasm_unquote(t->text.a, NULL);
-	return t->text.a;
-    }
+/* Unquote any string, can produce any arbitrary binary output */
+static const char *unquote_token(Token *t)
+{
+    return unquote_token_anystr(t, 0, STR_NASM);
 }
 
 /*
@@ -753,28 +760,7 @@ static const char *unquote_token(Token *t)
  */
 static const char *unquote_token_cstr(Token *t)
 {
-    if (t->type != TOK_STRING)
-	return tok_text(t);
-
-    t->type = TOK_INTERNAL_STRING;
-
-    if (t->len > INLINE_TEXT) {
-	char *p = t->text.p.ptr;
-
-	t->len = nasm_unquote_cstr(p, NULL);
-
-	if (t->len <= INLINE_TEXT) {
-	    nasm_zero(t->text.a);
-	    memcpy(t->text.a, p, t->len);
-	    nasm_free(p);
-	    return t->text.a;
-	} else {
-	    return p;
-	}
-    } else {
-	t->len = nasm_unquote_cstr(t->text.a, NULL);
-	return t->text.a;
-    }
+    return unquote_token_anystr(t, BADCTL, STR_NASM);
 }
 
 /*
@@ -3389,14 +3375,19 @@ static int line_directive(Token *origline, Token *tline)
     tline = skip_white(tline);
     if (tline) {
         if (tline->type == TOK_STRING) {
+            const char *fname;
             /*
              * If this is a quoted string, ignore anything after
              * it; this allows for compatiblity with gcc's
              * additional flags options.
              */
-            src_set_fname(unquote_token(tline));
+
+            fname = unquote_token_anystr(tline, BADCTL,
+                                          dname[0] == '#' ? STR_C : STR_NASM);
+            src_set_fname(fname);
         } else {
-            char *fname = detoken(tline, false);
+            char *fname;
+            fname = detoken(tline, false);
             src_set_fname(fname);
             nasm_free(fname);
         }
diff --git a/asm/quote.c b/asm/quote.c
index 58bb5a10..301abed7 100644
--- a/asm/quote.c
+++ b/asm/quote.c
@@ -1,6 +1,6 @@
 /* ----------------------------------------------------------------------- *
  *
- *   Copyright 1996-2019 The NASM Authors - All Rights Reserved
+ *   Copyright 1996-2020 The NASM Authors - All Rights Reserved
  *   See the file AUTHORS included with the NASM distribution for
  *   the specific copyright holders.
  *
@@ -291,10 +291,17 @@ char *nasm_quote_cstr(const char *str, size_t *lenp)
  * corresponding to bits set in badctl; in that case, the output
  * string, but not *ep, is truncated before the first invalid
  * character.
+ *
+ * badctl is a bitmask of control characters (0-31) which are forbidden
+ * from appearing in the final output.
+ *
+ * The qstart character can be either '`' (NASM style) or '\"' (C style),
+ * to indicate the lead marker of a quoted string. If it is '\"', then
+ * '`' is not a special character at all.
  */
 
-static size_t nasm_unquote_common(char *str, char **ep,
-                                  const uint32_t badctl)
+size_t nasm_unquote_anystr(char *str, char **ep, const uint32_t badctl,
+                           const char qstart)
 {
     unsigned char bq;
     const unsigned char *p;
@@ -319,15 +326,7 @@ static size_t nasm_unquote_common(char *str, char **ep,
     if (!bq)
 	return 0;
 
-    switch (bq) {
-    case '\'':
-    case '\"':
-	/* '...' or "..." string */
-        while ((c = *p++) && (c != bq))
-            EMIT(c);
-	break;
-
-    case '`':
+    if (bq == (unsigned char)qstart) {
 	/* `...` string */
 	state = st_start;
 
@@ -335,18 +334,13 @@ static size_t nasm_unquote_common(char *str, char **ep,
 	    c = *p++;
 	    switch (state) {
 	    case st_start:
-		switch (c) {
-		case '\\':
+                if (c == '\\') {
 		    state = st_backslash;
-		    break;
-		case '`':
-                case '\0':
+                } else if ((c == '\0') | (c == bq)) {
                     state = st_done;
-                    break;
-		default:
+                } else {
                     EMIT(c);
-		    break;
-		}
+                }
 		break;
 
 	    case st_backslash:
@@ -450,14 +444,19 @@ static size_t nasm_unquote_common(char *str, char **ep,
             default:
                 panic();
             }
-    }
-    break;
-
-    default:
+        }
+    } else if (bq == '\'' || bq == '\"') {
+	/*
+         * '...' or "..." string, NASM legacy style (no escapes of
+         * any kind, including collapsing double quote marks.)
+         * We obviously can't get here if qstart == '\"'.
+         */
+        while ((c = *p++) && (c != bq))
+            EMIT(c);
+    } else {
 	/* Not a quoted string, just return the input... */
         while ((c = *p++))
             EMIT(c);
-	break;
     }
 
     /* Zero-terminate the output */
@@ -472,24 +471,30 @@ static size_t nasm_unquote_common(char *str, char **ep,
 }
 #undef EMIT
 
+/*
+ * Unquote any arbitrary string; may produce any bytes, including embedded
+ * control- and NUL characters.
+ */
 size_t nasm_unquote(char *str, char **ep)
 {
-    return nasm_unquote_common(str, ep, 0);
+    return nasm_unquote_anystr(str, ep, 0, STR_NASM);
 }
+
+/*
+ * Unquote a string intended to be used as a C string; most control
+ * characters are rejected, including whitespace characters that
+ * would imply line endings and so on.
+ */
 size_t nasm_unquote_cstr(char *str, char **ep)
 {
-    /*
-     * These are the only control characters permitted: BEL BS TAB ESC
-     */
-    const uint32_t okctl = (1 << '\a') | (1 << '\b') | (1 << '\t') | (1 << 27);
-
-    return nasm_unquote_common(str, ep, ~okctl);
+    return nasm_unquote_anystr(str, ep, BADCTL, STR_NASM);
 }
 
 /*
  * Find the end of a quoted string; returns the pointer to the terminating
  * character (either the ending quote or the null character, if unterminated.)
  * If the input is not a quoted string, return NULL.
+ * This applies to NASM style strings only.
  */
 char *nasm_skip_string(const char *str)
 {
@@ -537,7 +542,9 @@ char *nasm_skip_string(const char *str)
 		 * Note: for the purpose of finding the end of the string,
 		 * all successor states to st_backslash are functionally
 		 * equivalent to st_start, since either a backslash or
-		 * a backquote will force a return to the st_start state.
+		 * a backquote will force a return to the st_start state,
+                 * and any possible multi-character state will terminate
+                 * for any non-alphanumeric character.
 		 */
 		state = c ? st_start : st_done;
 		break;
diff --git a/asm/quote.h b/asm/quote.h
index 7259f7cd..d8226cdb 100644
--- a/asm/quote.h
+++ b/asm/quote.h
@@ -38,9 +38,24 @@
 
 char *nasm_quote(const char *str, size_t *len);
 char *nasm_quote_cstr(const char *str, size_t *len);
+size_t nasm_unquote_anystr(char *str, char **endptr,
+                           uint32_t badctl, char qstart);
 size_t nasm_unquote(char *str, char **endptr);
 size_t nasm_unquote_cstr(char *str, char **endptr);
 char *nasm_skip_string(const char *str);
 
+/* Arguments used with nasm_unquote_anystr() */
+
+/*
+ * These are the only control characters permitted when we produce a C string:
+ * BEL BS TAB ESC
+ */
+#define OKCTL ((1U << '\a') | (1U << '\b') | (1U << '\t') | (1U << 27))
+#define BADCTL (~(uint32_t)OKCTL)
+
+/* Initial quotation mark */
+#define STR_C    '\"'
+#define STR_NASM '`'
+
 #endif /* NASM_QUOTE_H */
 


More information about the Nasm-commits mailing list