diff options
author | Denys Vlasenko | 2023-06-08 10:42:39 +0200 |
---|---|---|
committer | Denys Vlasenko | 2023-06-08 10:42:39 +0200 |
commit | 2ca39ffd447ca874fcea933194829717d5573247 (patch) | |
tree | 6d9eb4ba80ad9feec70c3f4f25dd3f7629c5fe5a | |
parent | 113685fbcd4c3432ec9b640583d50ba8da2102e8 (diff) | |
download | busybox-2ca39ffd447ca874fcea933194829717d5573247.zip busybox-2ca39ffd447ca874fcea933194829717d5573247.tar.gz |
awk: fix subst code to handle "start of word" pattern correctly (needs REG_STARTEND)
function old new delta
awk_sub 637 714 +77
Signed-off-by: Denys Vlasenko <vda.linux@googlemail.com>
-rw-r--r-- | editors/awk.c | 49 | ||||
-rwxr-xr-x | testsuite/awk.tests | 28 |
2 files changed, 51 insertions, 26 deletions
diff --git a/editors/awk.c b/editors/awk.c index df9b7fd..171f0a7 100644 --- a/editors/awk.c +++ b/editors/awk.c @@ -2504,17 +2504,46 @@ static int awk_sub(node *rn, const char *repl, int nm, var *src, var *dest /*,in regex_t sreg, *regex; /* True only if called to implement gensub(): */ int subexp = (src != dest); - +#if defined(REG_STARTEND) + const char *src_string; + size_t src_strlen; + regexec_flags = REG_STARTEND; +#else + regexec_flags = 0; +#endif resbuf = NULL; residx = 0; match_no = 0; - regexec_flags = 0; regex = as_regex(rn, &sreg); sp = getvar_s(src ? src : intvar[F0]); +#if defined(REG_STARTEND) + src_string = sp; + src_strlen = strlen(src_string); +#endif replen = strlen(repl); - while (regexec(regex, sp, 10, pmatch, regexec_flags) == 0) { - int so = pmatch[0].rm_so; - int eo = pmatch[0].rm_eo; + for (;;) { + int so, eo; + +#if defined(REG_STARTEND) +// REG_STARTEND: "This flag is a BSD extension, not present in POSIX" + size_t start_ofs = sp - src_string; + pmatch[0].rm_so = start_ofs; + pmatch[0].rm_eo = src_strlen; + if (regexec(regex, src_string, 10, pmatch, regexec_flags) != 0) + break; + eo = pmatch[0].rm_eo - start_ofs; + so = pmatch[0].rm_so - start_ofs; +#else +// BUG: +// gsub(/\<b*/,"") on "abc" matches empty string at "a...", +// advances sp one char (see "Empty match" comment later) to "bc" +// ... and erroneously matches "b" even though it is NOT at the word start. + enum { start_ofs = 0 }; + if (regexec(regex, sp, 10, pmatch, regexec_flags) != 0) + break; + so = pmatch[0].rm_so; + eo = pmatch[0].rm_eo; +#endif //bb_error_msg("match %u: [%u,%u] '%s'%p", match_no+1, so, eo, sp,sp); resbuf = qrealloc(resbuf, residx + eo + replen, &resbufsize); @@ -2543,7 +2572,7 @@ static int awk_sub(node *rn, const char *repl, int nm, var *src, var *dest /*,in } n = pmatch[j].rm_eo - pmatch[j].rm_so; resbuf = qrealloc(resbuf, residx + replen + n, &resbufsize); - memcpy(resbuf + residx, sp + pmatch[j].rm_so, n); + memcpy(resbuf + residx, sp + pmatch[j].rm_so - start_ofs, n); residx += n; } else resbuf[residx++] = c; @@ -2557,12 +2586,6 @@ static int awk_sub(node *rn, const char *repl, int nm, var *src, var *dest /*,in if (eo == so) { /* Empty match (e.g. "b*" will match anywhere). * Advance by one char. */ -//BUG (bug 1333): -//gsub(/\<b*/,"") on "abc" will reach this point, advance to "bc" -//... and will erroneously match "b" even though it is NOT at the word start. -//we need REG_NOTBOW but it does not exist... -//TODO: if EXTRA_COMPAT=y, use GNU matching and re_search, -//it should be able to do it correctly. /* Subtle: this is safe only because * qrealloc allocated at least one extra byte */ resbuf[residx] = *sp; @@ -2571,7 +2594,7 @@ static int awk_sub(node *rn, const char *repl, int nm, var *src, var *dest /*,in sp++; residx++; } - regexec_flags = REG_NOTBOL; + regexec_flags |= REG_NOTBOL; } resbuf = qrealloc(resbuf, residx + strlen(sp), &resbufsize); diff --git a/testsuite/awk.tests b/testsuite/awk.tests index c61d329..5a792c2 100755 --- a/testsuite/awk.tests +++ b/testsuite/awk.tests @@ -557,14 +557,12 @@ testing 'awk gensub backslashes \' \ 'awk '$sq'BEGIN { s="\\"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \ 's=\\ \\|\\ -' \ - '' '' +' '' '' testing 'awk gensub backslashes \\' \ 'awk '$sq'BEGIN { s="\\\\"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \ 's=\\\\ \\|\\ -' \ - '' '' +' '' '' # gawk 5.1.1 handles trailing unpaired \ inconsistently. # If replace string is single \, it is used verbatim, # but if it is \\\ (three slashes), gawk uses "\<NUL>" (!!!), not "\\" as you would expect. @@ -572,31 +570,35 @@ testing 'awk gensub backslashes \\\' \ 'awk '$sq'BEGIN { s="\\\\\\"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \ 's=\\\\\\ \\\\|\\\\ -' \ - '' '' +' '' '' testing 'awk gensub backslashes \\\\' \ 'awk '$sq'BEGIN { s="\\\\\\\\"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \ 's=\\\\\\\\ \\\\|\\\\ -' \ - '' '' +' '' '' testing 'awk gensub backslashes \&' \ 'awk '$sq'BEGIN { s="\\&"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \ 's=\\& &|& -' \ - '' '' +' '' '' testing 'awk gensub backslashes \0' \ 'awk '$sq'BEGIN { s="\\0"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \ 's=\\0 a|a -' \ - '' '' +' '' '' testing 'awk gensub backslashes \\0' \ 'awk '$sq'BEGIN { s="\\\\0"; print "s=" s; print gensub("a", s, "g", "a|a") }'$sq \ 's=\\\\0 \\0|\\0 -' \ +' '' '' + +# The "b" in "abc" should not match <b* pattern. +# Currently we use REG_STARTEND ("This flag is a BSD extension, not present in POSIX") +# to implement the code to handle this correctly, but if your libc has no REG_STARTEND, +# the alternative code mishandles this case. +testing 'awk gsub erroneous word start match' \ + "awk 'BEGIN { a=\"abc\"; gsub(/\<b*/,\"\",a); print a }'" \ + 'abc\n' \ '' '' exit $FAILCOUNT |