diff options
author | Denys Vlasenko | 2020-12-02 19:07:31 +0100 |
---|---|---|
committer | Denys Vlasenko | 2020-12-02 19:07:31 +0100 |
commit | 665a65953076ea21be49250b8279ddb1f0f99f38 (patch) | |
tree | bfb46738da9fec6715843197b5987ad56d4fcf76 | |
parent | 50ead33c45919abffde35313daac4c2dfd8641ca (diff) | |
download | busybox-665a65953076ea21be49250b8279ddb1f0f99f38.zip busybox-665a65953076ea21be49250b8279ddb1f0f99f38.tar.gz |
awk: FS regex matches only non-empty separators (gawk compat)
function old new delta
awk_split 484 553 +69
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
-rw-r--r-- | editors/awk.c | 33 | ||||
-rwxr-xr-x | testsuite/awk.tests | 7 |
2 files changed, 32 insertions, 8 deletions
diff --git a/editors/awk.c b/editors/awk.c
index d56d633..2c15f9e 100644
--- a/editors/awk.c
+++ b/editors/awk.c
@@ -1763,6 +1763,29 @@ static void fsrealloc(int size)
 	nfields = size;
 }
 
+static int regexec1_nonempty(const regex_t *preg, const char *s, regmatch_t pmatch[])
+{
+	int r = regexec(preg, s, 1, pmatch, 0);
+	if (r == 0 && pmatch[0].rm_eo == 0) {
+		/* For example, happens when FS can match
+		 * an empty string (awk -F ' *'). Logically,
+		 * this should split into one-char fields.
+		 * However, gawk 5.0.1 searches for first
+		 * _non-empty_ separator string match:
+		 */
+		size_t ofs = 0;
+		do {
+			ofs++;
+			if (!s[ofs])
+				return REG_NOMATCH;
+			regexec(preg, s + ofs, 1, pmatch, 0);
+		} while (pmatch[0].rm_eo == 0);
+		pmatch[0].rm_so += ofs;
+		pmatch[0].rm_eo += ofs;
+	}
+	return r;
+}
+
 static int awk_split(const char *s, node *spl, char **slist)
 {
 	int n;
@@ -1788,17 +1811,11 @@ static int awk_split(const char *s, node *spl, char **slist)
 			regmatch_t pmatch[2]; // TODO: why [2]? [1] is enough...
 
 			l = strcspn(s, c+2); /* len till next NUL or \n */
-			if (regexec(icase ? spl->r.ire : spl->l.re, s, 1, pmatch, 0) == 0
+			if (regexec1_nonempty(icase ? spl->r.ire : spl->l.re, s, pmatch) == 0
 			 && pmatch[0].rm_so <= l
 			) {
+				/* if (pmatch[0].rm_eo == 0) ... - impossible */
 				l = pmatch[0].rm_so;
-				if (pmatch[0].rm_eo == 0) {
-					/* For example, happens when FS can match
-					 * an empthy string (awk -F ' *')
-					 */
-					l++;
-					pmatch[0].rm_eo++;
-				}
 				n++; /* we saw yet another delimiter */
 			} else {
 				pmatch[0].rm_eo = l;
diff --git a/testsuite/awk.tests b/testsuite/awk.tests
index 87f6b50..06a531d 100755
--- a/testsuite/awk.tests
+++ b/testsuite/awk.tests
@@ -398,5 +398,12 @@ testing 'awk do not allow "str"++' \
 	'' \
 	'anything'
 
+#gawk compat: FS regex matches only non-empty separators:
+#with -*, the splitting is NOT f o o b a r, but foo bar:
+testing 'awk FS regex which can match empty string' \
+	"awk -F '-*' '{print \$1 \"-\" \$2 \"=\" \$3 \"*\" \$4}'" \
+	"foo-bar=*\n" \
+	'' \
+	'foo--bar'
 
 exit $FAILCOUNT