Dash Archive mirror
 help / color / mirror / Atom feed
From: Herbert Xu <herbert@gondor.apana.org.au>
To: DASH Mailing List <dash@vger.kernel.org>
Subject: [v3 PATCH 09/13] parser: Add support for multi-byte characters
Date: Sun, 05 May 2024 17:14:45 +0800	[thread overview]
Message-ID: <c943cd4d1e762ad7a48fb8697aacbd037a5fba69.1714900377.git.herbert@gondor.apana.org.au> (raw)
In-Reply-To: <cover.1714900377.git.herbert@gondor.apana.org.au>

Add the requisite markers for multi-byte characters so that the
expansion code can recognise them.  Also allow wide blank characters
to terminate words.

Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
 src/expand.c |  19 +++++++
 src/mktokens |   1 +
 src/parser.c | 136 +++++++++++++++++++++++++++++++++++++++++----------
 3 files changed, 129 insertions(+), 27 deletions(-)

diff --git a/src/expand.c b/src/expand.c
index dd2b71e..402289f 100644
--- a/src/expand.c
+++ b/src/expand.c
@@ -265,6 +265,7 @@ static char *argstr(char *p, int flag)
 		CTLESC,
 		CTLVAR,
 		CTLBACKQ,
+		CTLMBCHAR,
 		CTLARI,
 		CTLENDARI,
 		0
@@ -289,6 +290,8 @@ tilde:
 start:
 	startloc = expdest - (char *)stackblock();
 	for (;;) {
+		unsigned ml;
+		unsigned mb;
 		int end;
 
 		length += strcspn(p + length, reject);
@@ -351,6 +354,22 @@ addquote:
 				startloc++;
 			}
 			break;
+		case CTLMBCHAR:
+			c = (signed char)*p--;
+			mb = mbnext(p);
+			ml = (mb >> 8) - 2;
+			if (flag & QUOTES_ESC) {
+				length = (mb >> 8) + (mb & 0xff);
+				if (c == (char)CTLESC)
+					startloc += length;
+				break;
+			}
+			if (c == CTLESC)
+				startloc += ml;
+			p += mb & 0xff;
+			expdest = stnputs(p, ml, expdest);
+			p += mb >> 8;
+			break;
 		case CTLESC:
 			startloc++;
 			length++;
diff --git a/src/mktokens b/src/mktokens
index 78055be..dcef676 100644
--- a/src/mktokens
+++ b/src/mktokens
@@ -41,6 +41,7 @@
 
 cat > "${TMPDIR}"/ka$$ <<\!
 TEOF	1	end of file
+TBLANK	0	blank
 TNL	0	newline
 TSEMI	0	";"
 TBACKGND 0	"&"
diff --git a/src/parser.c b/src/parser.c
index 27611f0..71d61f3 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -36,7 +36,11 @@
 #include <alloca.h>
 #endif
 
+#include <limits.h>
+#include <stdbool.h>
 #include <stdlib.h>
+#include <wchar.h>
+#include <wctype.h>
 
 #include "shell.h"
 #include "parser.h"
@@ -801,6 +805,8 @@ xxreadtoken(void)
 		setprompt(2);
 	}
 	for (;;) {	/* until token or start of word found */
+		int tok;
+
 		c = pgetc_eatbnl();
 		switch (c) {
 		case ' ': case '\t':
@@ -834,9 +840,10 @@ xxreadtoken(void)
 		case ')':
 			RETURN(TRP);
 		}
-		break;
+		tok = readtoken1(c, BASESYNTAX, (char *)NULL, 0);
+		if (tok != TBLANK)
+			return tok;
 	}
-	return readtoken1(c, BASESYNTAX, (char *)NULL, 0);
 #undef RETURN
 }
 
@@ -876,7 +883,53 @@ static void synstack_pop(struct synstack **stack)
 	*stack = (*stack)->next;
 }
 
+static unsigned getmbc(int c, char *out, int mode)
+{
+	char *const start = out;
+	mbstate_t mbst = {};
+	unsigned ml = 0;
+	size_t ml2;
+	wchar_t wc;
+	char *mbc;
 
+	if (likely(c >= 0))
+		return 0;
+
+	mbc = (mode & 3) < 2 ? out + 2 + (mode == 1) : out;
+	mbc[ml] = c;
+	while ((ml2 = mbrtowc(&wc, mbc + ml++, 1, &mbst)) == -2) {
+		if (ml >= MB_LEN_MAX)
+			break;
+		c = pgetc_eoa();
+		if (c == PEOA || c == PEOF)
+			break;
+		mbc[ml] = c;
+	}
+
+	if (ml2 == 1 && ml > 1) {
+		if (mode == 4 && iswblank(wc))
+			return 1;
+
+		if ((mode & 3) < 2) {
+			USTPUTC(CTLMBCHAR, out);
+			if (mode == 1)
+				USTPUTC(CTLESC, out);
+			USTPUTC(ml, out);
+		}
+		STADJUST(ml, out);
+		if ((mode & 3) < 2) {
+			USTPUTC(ml, out);
+			USTPUTC(CTLMBCHAR, out);
+		}
+
+		return out - start;
+	}
+
+	if (ml > 1)
+		pungetn(ml - 1);
+
+	return 0;
+}
 
 /*
  * If eofmark is NULL, read a word or a redirection symbol.  If eofmark
@@ -929,12 +982,29 @@ readtoken1(int firstc, char const *syntax, char *eofmark, int striptabs)
 		}
 #endif
 		CHECKEND();	/* set c to PEOF if at end of here document */
-		for (;;) {	/* until end of line or end of word */
-			CHECKSTRSPACE(4, out);	/* permit 4 calls to USTPUTC */
+		/* Until end of line or end of word */
+		for (;; c = pgetc_top(synstack)) {
+			int fieldsplitting;
+			unsigned ml;
+
+			/* Permit max(MB_LEN_MAX + 7, 23) calls to USTPUTC. */
+			CHECKSTRSPACE((MB_LEN_MAX > 16 ? MB_LEN_MAX : 16) + 7,
+				      out);
+			fieldsplitting = synstack->syntax == BASESYNTAX &&
+					 !synstack->varnest ? 4 : 0;
+			ml = getmbc(c, out, fieldsplitting);
+			if (ml == 1) {
+				if (out == stackblock())
+					return TBLANK;
+				c = pgetc();
+				break;
+			}
+			out += ml;
+			if (ml)
+				continue;
 			switch(synstack->syntax[c]) {
 			case CNL:	/* '\n' */
-				if (synstack->syntax == BASESYNTAX &&
-				    !synstack->varnest)
+				if (fieldsplitting)
 					goto endword;	/* exit outer loop */
 				USTPUTC(c, out);
 				nlprompt();
@@ -956,26 +1026,33 @@ readtoken1(int firstc, char const *syntax, char *eofmark, int striptabs)
 					USTPUTC(CTLESC, out);
 					USTPUTC('\\', out);
 					pungetc();
-				} else {
-					if (
-						synstack->dblquote &&
-						c != '\\' && c != '`' &&
-						c != '$' && (
-							c != '"' ||
-							(eofmark != NULL &&
-							 !synstack->varnest)
-						) && (
-							c != '}' ||
-							!synstack->varnest
-						)
-					) {
-						USTPUTC(CTLESC, out);
-						USTPUTC('\\', out);
-					}
-					USTPUTC(CTLESC, out);
-					USTPUTC(c, out);
-					quotef++;
+					break;
 				}
+
+				if (
+					synstack->dblquote &&
+					c != '\\' && c != '`' &&
+					c != '$' && (
+						c != '"' ||
+						(eofmark != NULL &&
+						 !synstack->varnest)
+					) && (
+						c != '}' ||
+						!synstack->varnest
+					)
+				) {
+					USTPUTC(CTLESC, out);
+					USTPUTC('\\', out);
+				}
+				quotef++;
+
+				ml = getmbc(c, out, 1);
+				out += ml;
+				if (ml)
+					break;
+
+				USTPUTC(CTLESC, out);
+				USTPUTC(c, out);
 				break;
 			case CSQUOTE:
 				synstack->syntax = SQSYNTAX;
@@ -1053,11 +1130,10 @@ toggledq:
 			case CEOF:
 				goto endword;		/* exit outer loop */
 			default:
-				if (synstack->varnest == 0)
+				if (fieldsplitting)
 					goto endword;	/* exit outer loop */
 				USTPUTC(c, out);
 			}
-			c = pgetc_top(synstack);
 		}
 	}
 endword:
@@ -1384,6 +1460,7 @@ parsebackq: {
 	size_t psavelen;
 	size_t savelen;
 	union node *n;
+	unsigned ml;
 	char *pstr;
 	char *str;
 
@@ -1415,6 +1492,11 @@ parsebackq: {
                                 if (pc != '\\' && pc != '`' && pc != '$'
                                     && (!synstack->dblquote || pc != '"'))
                                         STPUTC('\\', pout);
+				CHECKSTRSPACE(MB_LEN_MAX, pout);
+				ml = getmbc(pc, pout, 2);
+				pout += ml;
+				if (ml)
+					continue;
 				break;
 
 			case PEOF:
-- 
2.39.2


  parent reply	other threads:[~2024-05-05  9:14 UTC|newest]

Thread overview: 14+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-05-05  9:14 [v3 PATCH 00/13] Add multi-byte support Herbert Xu
2024-05-05  9:14 ` [v3 PATCH 01/13] shell: Call setlocale Herbert Xu
2024-05-05  9:14 ` [v3 PATCH 02/13] shell: Use strcoll instead of strcmp where applicable Herbert Xu
2024-05-05  9:14 ` [v3 PATCH 03/13] expand: Count multi-byte characters for VSLENGTH Herbert Xu
2024-05-05  9:14 ` [v3 PATCH 04/13] expand: Process multi-byte characters in subevalvar Herbert Xu
2024-05-05  9:14 ` [v3 PATCH 05/13] expand: Process multi-byte characters in expmeta Herbert Xu
2024-05-05  9:14 ` [v3 PATCH 06/13] expand: Support multi-byte characters during field splitting Herbert Xu
2024-05-05  9:14 ` [v3 PATCH 07/13] input: Allow MB_LEN_MAX calls to pungetc Herbert Xu
2024-05-05  9:14 ` [v3 PATCH 08/13] input: Add pgetc_eoa Herbert Xu
2024-05-05  9:14 ` Herbert Xu [this message]
2024-05-05  9:15 ` [v3 PATCH 10/13] input: Always push in setinputfile Herbert Xu
2024-05-05  9:15 ` [v3 PATCH 11/13] memalloc: Use void * instead of pointer Herbert Xu
2024-05-05  9:15 ` [v3 PATCH 12/13] builtin: Use pgetc in read(1) Herbert Xu
2024-05-05  9:15 ` [v3 PATCH 13/13] builtin: Process multi-byte characters " Herbert Xu

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=c943cd4d1e762ad7a48fb8697aacbd037a5fba69.1714900377.git.herbert@gondor.apana.org.au \
    --to=herbert@gondor.apana.org.au \
    --cc=dash@vger.kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link.

Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).