From 21f6fbf545e7fa58f0eaa444001a9d25bc37c4eb Mon Sep 17 00:00:00 2001 From: Denys Vlasenko Date: Mon, 4 Jun 2012 14:44:47 +0200 Subject: sed: fix zero chars match/replace function old new delta process_files 2099 2181 +82 Signed-off-by: Denys Vlasenko --- editors/sed.c | 64 +++++++++++++++++++++++++++++++++++------------------ testsuite/sed.tests | 10 +++++++-- 2 files changed, 50 insertions(+), 24 deletions(-) diff --git a/editors/sed.c b/editors/sed.c index a2df931..87fc755 100644 --- a/editors/sed.c +++ b/editors/sed.c @@ -673,7 +673,7 @@ static void do_subst_w_backrefs(char *line, char *replace) /* go through the replacement string */ for (i = 0; replace[i]; i++) { - /* if we find a backreference (\1, \2, etc.) print the backref'ed * text */ + /* if we find a backreference (\1, \2, etc.) print the backref'ed text */ if (replace[i] == '\\') { unsigned backref = replace[++i] - '0'; if (backref <= 9) { @@ -707,8 +707,10 @@ static void do_subst_w_backrefs(char *line, char *replace) static int do_subst_command(sed_cmd_t *sed_cmd, char **line_p) { char *line = *line_p; - int altered = 0; unsigned match_count = 0; + bool altered = 0; + bool prev_match_empty = 1; + bool tried_at_eol = 0; regex_t *current_regex; current_regex = sed_cmd->sub_match; @@ -737,46 +739,64 @@ static int do_subst_command(sed_cmd_t *sed_cmd, char **line_p) do { int i; - /* Work around bug in glibc regexec, demonstrated by: - * echo " a.b" | busybox sed 's [^ .]* x g' - * The match_count check is so not to break - * echo "hi" | busybox sed 's/^/!/g' - */ - if (!G.regmatch[0].rm_so && !G.regmatch[0].rm_eo && match_count) { - pipe_putc(*line++); - goto next; - } - match_count++; /* If we aren't interested in this match, output old line to - end of match and continue */ + * end of match and continue */ if (sed_cmd->which_match && (sed_cmd->which_match != match_count) ) { for (i = 0; i < G.regmatch[0].rm_eo; i++) pipe_putc(*line++); + /* Null match? Print one more char */ + if (G.regmatch[0].rm_so == i && *line) + pipe_putc(*line++); goto next; } - /* print everything before the match */ + /* Print everything before the match */ for (i = 0; i < G.regmatch[0].rm_so; i++) pipe_putc(line[i]); - /* then print the substitution string */ - do_subst_w_backrefs(line, sed_cmd->string); + /* Then print the substitution string, + * unless we just matched empty string after non-empty one. + * Example: string "cccd", pattern "c*", repl "R": + * result is "RdR", not "RRdR": first match "ccc", + * second is "" before "d", third is "" after "d". + * Second match is NOT replaced! + */ + if (prev_match_empty || i != 0) { + dbg("inserting replacement at %d in '%s'", i, line); + do_subst_w_backrefs(line, sed_cmd->string); + } else { + dbg("NOT inserting replacement at %d in '%s'", i, line); + } + + /* If matched string is empty (f.e. "c*" pattern), + * copy verbatim one char after it before attempting more matches + */ + prev_match_empty = (G.regmatch[0].rm_eo == i); + if (prev_match_empty && line[i]) { + pipe_putc(line[i]); + G.regmatch[0].rm_eo++; + } - /* advance past the match */ + /* Advance past the match */ + dbg("line += %d", G.regmatch[0].rm_eo); line += G.regmatch[0].rm_eo; - /* flag that something has changed */ - altered++; + /* Flag that something has changed */ + altered = 1; /* if we're not doing this globally, get out now */ if (sed_cmd->which_match != 0) break; next: - if (*line == '\0') - break; + /* Exit if we are at EOL and already tried matching at it */ + if (*line == '\0') { + if (tried_at_eol) + break; + tried_at_eol = 1; + } //maybe (G.regmatch[0].rm_eo ? REG_NOTBOL : 0) instead of unconditional REG_NOTBOL? } while (regexec(current_regex, line, 10, G.regmatch, REG_NOTBOL) != REG_NOMATCH); @@ -1127,7 +1147,7 @@ static void process_files(void) case 's': if (!do_subst_command(sed_cmd, &pattern_space)) break; - dbg("do_subst_command succeeeded:'%s'", pattern_space); + dbg("do_subst_command succeeded:'%s'", pattern_space); substituted |= 1; /* handle p option */ diff --git a/testsuite/sed.tests b/testsuite/sed.tests index 9fa8e19..375beb5 100755 --- a/testsuite/sed.tests +++ b/testsuite/sed.tests @@ -52,10 +52,8 @@ testing "sed with empty match" "sed 's/z*//g'" "string\n" "" "string\n" testing "sed s//p" "sed -e s/foo/bar/p -e s/bar/baz/p" "bar\nbaz\nbaz\n" \ "" "foo\n" testing "sed -n s//p" "sed -ne s/abc/def/p" "def\n" "" "abc\n" -test x"$SKIP_KNOWN_BUGS" = x"" && { testing "sed s//g (exhaustive)" "sed -e 's/[[:space:]]*/,/g'" ",1,2,3,4,5,\n" \ "" "12345\n" -} testing "sed s arbitrary delimiter" "sed -e 's woo boing '" "boing\n" "" "woo\n" testing "sed s chains" "sed -e s/foo/bar/ -e s/bar/baz/" "baz\n" "" "foo\n" testing "sed s chains2" "sed -e s/foo/bar/ -e s/baz/nee/" "bar\n" "" "foo\n" @@ -296,6 +294,14 @@ testing "sed -i finishes ranges correctly" \ "sed '1,2d' -i input; echo \$?; cat input" \ "0\n3\n4\n" "1\n2\n3\n4\n" "" +testing "sed zero chars match/replace advances correctly 1" \ + "sed 's/l*/@/g'" \ + "@h@e@o@\n" "" "helllo\n" + +testing "sed zero chars match/replace advances correctly 2" \ + "sed 's [^ .]* x g'" \ + "x x.x\n" "" " a.b\n" + # testing "description" "commands" "result" "infile" "stdin" exit $FAILCOUNT -- cgit v1.1