From: Herbert Xu <herbert@gondor.apana.org.au>
To: DASH Mailing List <dash@vger.kernel.org>
Subject: [v3 PATCH 09/13] parser: Add support for multi-byte characters
Date: Sun, 05 May 2024 17:14:45 +0800 [thread overview]
Message-ID: <c943cd4d1e762ad7a48fb8697aacbd037a5fba69.1714900377.git.herbert@gondor.apana.org.au> (raw)
In-Reply-To: <cover.1714900377.git.herbert@gondor.apana.org.au>
Add the requisite markers for multi-byte characters so that the
expansion code can recognise them. Also allow wide blank characters
to terminate words.
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
---
src/expand.c | 19 +++++++
src/mktokens | 1 +
src/parser.c | 136 +++++++++++++++++++++++++++++++++++++++++----------
3 files changed, 129 insertions(+), 27 deletions(-)
diff --git a/src/expand.c b/src/expand.c
index dd2b71e..402289f 100644
--- a/src/expand.c
+++ b/src/expand.c
@@ -265,6 +265,7 @@ static char *argstr(char *p, int flag)
CTLESC,
CTLVAR,
CTLBACKQ,
+ CTLMBCHAR,
CTLARI,
CTLENDARI,
0
@@ -289,6 +290,8 @@ tilde:
start:
startloc = expdest - (char *)stackblock();
for (;;) {
+ unsigned ml;
+ unsigned mb;
int end;
length += strcspn(p + length, reject);
@@ -351,6 +354,22 @@ addquote:
startloc++;
}
break;
+ case CTLMBCHAR:
+ c = (signed char)*p--;
+ mb = mbnext(p);
+ ml = (mb >> 8) - 2;
+ if (flag & QUOTES_ESC) {
+ length = (mb >> 8) + (mb & 0xff);
+ if (c == (char)CTLESC)
+ startloc += length;
+ break;
+ }
+ if (c == CTLESC)
+ startloc += ml;
+ p += mb & 0xff;
+ expdest = stnputs(p, ml, expdest);
+ p += mb >> 8;
+ break;
case CTLESC:
startloc++;
length++;
diff --git a/src/mktokens b/src/mktokens
index 78055be..dcef676 100644
--- a/src/mktokens
+++ b/src/mktokens
@@ -41,6 +41,7 @@
cat > "${TMPDIR}"/ka$$ <<\!
TEOF 1 end of file
+TBLANK 0 blank
TNL 0 newline
TSEMI 0 ";"
TBACKGND 0 "&"
diff --git a/src/parser.c b/src/parser.c
index 27611f0..71d61f3 100644
--- a/src/parser.c
+++ b/src/parser.c
@@ -36,7 +36,11 @@
#include <alloca.h>
#endif
+#include <limits.h>
+#include <stdbool.h>
#include <stdlib.h>
+#include <wchar.h>
+#include <wctype.h>
#include "shell.h"
#include "parser.h"
@@ -801,6 +805,8 @@ xxreadtoken(void)
setprompt(2);
}
for (;;) { /* until token or start of word found */
+ int tok;
+
c = pgetc_eatbnl();
switch (c) {
case ' ': case '\t':
@@ -834,9 +840,10 @@ xxreadtoken(void)
case ')':
RETURN(TRP);
}
- break;
+ tok = readtoken1(c, BASESYNTAX, (char *)NULL, 0);
+ if (tok != TBLANK)
+ return tok;
}
- return readtoken1(c, BASESYNTAX, (char *)NULL, 0);
#undef RETURN
}
@@ -876,7 +883,53 @@ static void synstack_pop(struct synstack **stack)
*stack = (*stack)->next;
}
+static unsigned getmbc(int c, char *out, int mode) /* Try to decode one multi-byte character whose first byte is c, storing it at out; returns the number of bytes emitted, 1 for a wide blank in mode 4, or 0 if nothing was emitted. */
+{
+ char *const start = out;
+ mbstate_t mbst = {};
+ unsigned ml = 0; /* number of bytes of the candidate character stored so far */
+ size_t ml2; /* result of the most recent mbrtowc() call */
+ wchar_t wc;
+ char *mbc; /* where the raw bytes of the character are stored */
+ if (likely(c >= 0)) /* only negative character values are treated as possible multi-byte starts */
+ return 0;
+
+ mbc = (mode & 3) < 2 ? out + 2 + (mode == 1) : out; /* modes 0, 1 and 4 leave room for the CTLMBCHAR and length bytes, plus one CTLESC byte when mode == 1 */
+ mbc[ml] = c;
+ while ((ml2 = mbrtowc(&wc, mbc + ml++, 1, &mbst)) == -2) { /* feed one byte at a time; (size_t)-2 means the sequence is still incomplete */
+ if (ml >= MB_LEN_MAX) /* give up once MB_LEN_MAX bytes have been tried */
+ break;
+ c = pgetc_eoa(); /* fetch the next input byte */
+ if (c == PEOA || c == PEOF) /* stop on the PEOA or PEOF marker */
+ break;
+ mbc[ml] = c;
+ }
+
+ if (ml2 == 1 && ml > 1) { /* the last byte completed a character that spans more than one byte */
+ if (mode == 4 && iswblank(wc)) /* mode 4: report a wide blank with the value 1; out is not advanced */
+ return 1;
+
+ if ((mode & 3) < 2) { /* leading marker: CTLMBCHAR, optional CTLESC, then the byte length */
+ USTPUTC(CTLMBCHAR, out);
+ if (mode == 1)
+ USTPUTC(CTLESC, out);
+ USTPUTC(ml, out);
+ }
+ STADJUST(ml, out); /* presumably advances out past the ml bytes already stored through mbc - confirm against the macro */
+ if ((mode & 3) < 2) { /* trailing marker: the byte length, then CTLMBCHAR */
+ USTPUTC(ml, out);
+ USTPUTC(CTLMBCHAR, out);
+ }
+
+ return out - start; /* total bytes emitted; always at least 2 here because ml > 1 */
+ }
+
+ if (ml > 1) /* no complete character was decoded after reading extra bytes */
+ pungetn(ml - 1); /* presumably pushes back the ml - 1 bytes read after the first - confirm */
+
+ return 0;
+}
/*
* If eofmark is NULL, read a word or a redirection symbol. If eofmark
@@ -929,12 +982,29 @@ readtoken1(int firstc, char const *syntax, char *eofmark, int striptabs)
}
#endif
CHECKEND(); /* set c to PEOF if at end of here document */
- for (;;) { /* until end of line or end of word */
- CHECKSTRSPACE(4, out); /* permit 4 calls to USTPUTC */
+ /* Until end of line or end of word */
+ for (;; c = pgetc_top(synstack)) {
+ int fieldsplitting;
+ unsigned ml;
+
+ /* Permit max(MB_LEN_MAX, 16) + 7 calls to USTPUTC. */
+ CHECKSTRSPACE((MB_LEN_MAX > 16 ? MB_LEN_MAX : 16) + 7,
+ out);
+ fieldsplitting = synstack->syntax == BASESYNTAX &&
+ !synstack->varnest ? 4 : 0;
+ ml = getmbc(c, out, fieldsplitting);
+ if (ml == 1) {
+ if (out == stackblock())
+ return TBLANK;
+ c = pgetc();
+ break;
+ }
+ out += ml;
+ if (ml)
+ continue;
switch(synstack->syntax[c]) {
case CNL: /* '\n' */
- if (synstack->syntax == BASESYNTAX &&
- !synstack->varnest)
+ if (fieldsplitting)
goto endword; /* exit outer loop */
USTPUTC(c, out);
nlprompt();
@@ -956,26 +1026,33 @@ readtoken1(int firstc, char const *syntax, char *eofmark, int striptabs)
USTPUTC(CTLESC, out);
USTPUTC('\\', out);
pungetc();
- } else {
- if (
- synstack->dblquote &&
- c != '\\' && c != '`' &&
- c != '$' && (
- c != '"' ||
- (eofmark != NULL &&
- !synstack->varnest)
- ) && (
- c != '}' ||
- !synstack->varnest
- )
- ) {
- USTPUTC(CTLESC, out);
- USTPUTC('\\', out);
- }
- USTPUTC(CTLESC, out);
- USTPUTC(c, out);
- quotef++;
+ break;
}
+
+ if (
+ synstack->dblquote &&
+ c != '\\' && c != '`' &&
+ c != '$' && (
+ c != '"' ||
+ (eofmark != NULL &&
+ !synstack->varnest)
+ ) && (
+ c != '}' ||
+ !synstack->varnest
+ )
+ ) {
+ USTPUTC(CTLESC, out);
+ USTPUTC('\\', out);
+ }
+ quotef++;
+
+ ml = getmbc(c, out, 1);
+ out += ml;
+ if (ml)
+ break;
+
+ USTPUTC(CTLESC, out);
+ USTPUTC(c, out);
break;
case CSQUOTE:
synstack->syntax = SQSYNTAX;
@@ -1053,11 +1130,10 @@ toggledq:
case CEOF:
goto endword; /* exit outer loop */
default:
- if (synstack->varnest == 0)
+ if (fieldsplitting)
goto endword; /* exit outer loop */
USTPUTC(c, out);
}
- c = pgetc_top(synstack);
}
}
endword:
@@ -1384,6 +1460,7 @@ parsebackq: {
size_t psavelen;
size_t savelen;
union node *n;
+ unsigned ml;
char *pstr;
char *str;
@@ -1415,6 +1492,11 @@ parsebackq: {
if (pc != '\\' && pc != '`' && pc != '$'
&& (!synstack->dblquote || pc != '"'))
STPUTC('\\', pout);
+ CHECKSTRSPACE(MB_LEN_MAX, pout);
+ ml = getmbc(pc, pout, 2);
+ pout += ml;
+ if (ml)
+ continue;
break;
case PEOF:
--
2.39.2
next prev parent reply other threads:[~2024-05-05 9:14 UTC|newest]
Thread overview: 14+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-05-05 9:14 [v3 PATCH 00/13] Add multi-byte support Herbert Xu
2024-05-05 9:14 ` [v3 PATCH 01/13] shell: Call setlocale Herbert Xu
2024-05-05 9:14 ` [v3 PATCH 02/13] shell: Use strcoll instead of strcmp where applicable Herbert Xu
2024-05-05 9:14 ` [v3 PATCH 03/13] expand: Count multi-byte characters for VSLENGTH Herbert Xu
2024-05-05 9:14 ` [v3 PATCH 04/13] expand: Process multi-byte characters in subevalvar Herbert Xu
2024-05-05 9:14 ` [v3 PATCH 05/13] expand: Process multi-byte characters in expmeta Herbert Xu
2024-05-05 9:14 ` [v3 PATCH 06/13] expand: Support multi-byte characters during field splitting Herbert Xu
2024-05-05 9:14 ` [v3 PATCH 07/13] input: Allow MB_LEN_MAX calls to pungetc Herbert Xu
2024-05-05 9:14 ` [v3 PATCH 08/13] input: Add pgetc_eoa Herbert Xu
2024-05-05 9:14 ` Herbert Xu [this message]
2024-05-05 9:15 ` [v3 PATCH 10/13] input: Always push in setinputfile Herbert Xu
2024-05-05 9:15 ` [v3 PATCH 11/13] memalloc: Use void * instead of pointer Herbert Xu
2024-05-05 9:15 ` [v3 PATCH 12/13] builtin: Use pgetc in read(1) Herbert Xu
2024-05-05 9:15 ` [v3 PATCH 13/13] builtin: Process multi-byte characters " Herbert Xu
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=c943cd4d1e762ad7a48fb8697aacbd037a5fba69.1714900377.git.herbert@gondor.apana.org.au \
--to=herbert@gondor.apana.org.au \
--cc=dash@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).