summaryrefslogtreecommitdiff
path: root/regexp.c
diff options
context:
space:
mode:
authorMark Whitley2000-07-10 23:07:54 +0000
committerMark Whitley2000-07-10 23:07:54 +0000
commitcc54d12ef716e9206344c18006b46386fe49ba9b (patch)
treec2f88581d8e81aef6de5010941380e1cb872f463 /regexp.c
parentdd23b8bb431d27ea95534f528a1e1381e5841b8b (diff)
downloadbusybox-cc54d12ef716e9206344c18006b46386fe49ba9b.zip
busybox-cc54d12ef716e9206344c18006b46386fe49ba9b.tar.gz
Removed all trace of hand-tooled regexp routines. Bye bye, baby.
Diffstat (limited to 'regexp.c')
-rw-r--r--regexp.c789
1 files changed, 0 insertions, 789 deletions
diff --git a/regexp.c b/regexp.c
deleted file mode 100644
index 6fedb01..0000000
--- a/regexp.c
+++ /dev/null
@@ -1,789 +0,0 @@
-/* vi: set sw=4 ts=4: */
-/* regexp.c */
-
-#include "internal.h"
-#include "regexp.h"
-#include <setjmp.h>
-#include <stdio.h>
-#include <ctype.h>
-
-
-#define NSUBEXP 10
-typedef struct regexp {
- char *startp[NSUBEXP];
- char *endp[NSUBEXP];
- int minlen; /* length of shortest possible match */
- char first; /* first character, if known; else \0 */
- char bol; /* boolean: must start at beginning of line? */
- char program[1]; /* Unwarranted chumminess with compiler. */
-} regexp;
-
-
-static regexp *regcomp(char* text);
-static int regexec(struct regexp* re, char* str, int bol, int ignoreCase);
-static void regsub(struct regexp* re, char* src, char* dst);
-
-#if ( defined BB_GREP || defined BB_SED)
-
-/* This also tries to find a needle in a haystack, but uses
- * real regular expressions.... The fake regular expression
- * version of find_match lives in utility.c. Using this version
- * will add 3.9k to busybox...
- * -Erik Andersen
- */
-extern int find_match(char *haystack, char *needle, int ignoreCase)
-{
- int status;
- struct regexp *re;
-
- re = regcomp(needle);
- status = regexec(re, haystack, FALSE, ignoreCase);
- free(re);
- return (status);
-}
-
-#if defined BB_SED
-/* This performs substitutions after a regexp match has been found.
- * The new string is returned. It is malloc'ed, and do must be freed. */
-extern int replace_match(char *haystack, char *needle, char *newNeedle,
- int ignoreCase)
-{
- int status;
- struct regexp *re;
- char *s, buf[BUF_SIZE], *d = buf;
-
- re = regcomp(needle);
- status = regexec(re, haystack, FALSE, ignoreCase);
- if (status == TRUE) {
- s = haystack;
-
- do {
- /* copy stuff from before the match */
- while (s < re->startp[0])
- *d++ = *s++;
- /* substitute for the matched part */
- regsub(re, newNeedle, d);
- s = re->endp[0];
- d += strlen(d);
- } while (regexec(re, s, FALSE, ignoreCase) == TRUE);
- /* copy stuff from after the match */
- while ((*d++ = *s++)) {
- }
- d[0] = '\0';
- strcpy(haystack, buf);
- }
- free(re);
- return (status);
-}
-#endif
-
-
-/* code swiped from elvis-tiny 1.4 (a clone of vi) and adjusted to
- * suit the needs of busybox by Erik Andersen.
- *
- * From the README:
- * "Elvis is freely redistributable, in either source form or executable form.
- * There are no restrictions on how you may use it".
- * Elvis was written by Steve Kirkendall <kirkenda@cs.pdx.edu>
- *
- *
- * This file contains the code that compiles regular expressions and executes
- * them. It supports the same syntax and features as vi's regular expression
- * code. Specifically, the meta characters are:
- * ^ matches the beginning of a line
- * $ matches the end of a line
- * \< matches the beginning of a word
- * \> matches the end of a word
- * . matches any single character
- * [] matches any character in a character class
- * \( delimits the start of a subexpression
- * \) delimits the end of a subexpression
- * * repeats the preceding 0 or more times
- * NOTE: You cannot follow a \) with a *.
- *
- * The physical structure of a compiled RE is as follows:
- * - First, there is a one-byte value that says how many character classes
- * are used in this regular expression
- * - Next, each character class is stored as a bitmap that is 256 bits
- * (32 bytes) long.
- * - A mixture of literal characters and compiled meta characters follows.
- * This begins with M_BEGIN(0) and ends with M_END(0). All meta chars
- * are stored as a \n followed by a one-byte code, so they take up two
- * bytes apiece. Literal characters take up one byte apiece. \n can't
- * be used as a literal character.
- *
- */
-
-
-
-static char *previous; /* the previous regexp, used when null regexp is given */
-
-#if defined BB_SED
-static char *previous1; /* a copy of the text from the previous substitution for regsub() */
-#endif
-
-
-/* These are used to classify or recognize meta-characters */
-#define META '\0'
-#define BASE_META(m) ((m) - 256)
-#define INT_META(c) ((c) + 256)
-#define IS_META(m) ((m) >= 256)
-#define IS_CLASS(m) ((m) >= M_CLASS(0) && (m) <= M_CLASS(9))
-#define IS_START(m) ((m) >= M_START(0) && (m) <= M_START(9))
-#define IS_END(m) ((m) >= M_END(0) && (m) <= M_END(9))
-#define IS_CLOSURE(m) ((m) >= M_SPLAT && (m) <= M_QMARK)
-#define ADD_META(s,m) (*(s)++ = META, *(s)++ = BASE_META(m))
-#define GET_META(s) (*(s) == META ? INT_META(*++(s)) : *s)
-
-/* These are the internal codes used for each type of meta-character */
-#define M_BEGLINE 256 /* internal code for ^ */
-#define M_ENDLINE 257 /* internal code for $ */
-#define M_BEGWORD 258 /* internal code for \< */
-#define M_ENDWORD 259 /* internal code for \> */
-#define M_ANY 260 /* internal code for . */
-#define M_SPLAT 261 /* internal code for * */
-#define M_PLUS 262 /* internal code for \+ */
-#define M_QMARK 263 /* internal code for \? */
-#define M_CLASS(n) (264+(n)) /* internal code for [] */
-#define M_START(n) (274+(n)) /* internal code for \( */
-#define M_END(n) (284+(n)) /* internal code for \) */
-
-/* These are used during compilation */
-static int class_cnt; /* used to assign class IDs */
-static int start_cnt; /* used to assign start IDs */
-static int end_stk[NSUBEXP]; /* used to assign end IDs */
-static int end_sp;
-static char *retext; /* points to the text being compiled */
-
-/* error-handling stuff */
-jmp_buf errorhandler;
-
-#define FAIL(why) do {fprintf(stderr, why); longjmp(errorhandler, 1);} while (0)
-
-
-
-
-/* This function builds a bitmap for a particular class */
-/* text -- start of the class */
-/* bmap -- the bitmap */
-static char *makeclass(char *text, char *bmap)
-{
- int i;
- int complement = 0;
-
-
- /* zero the bitmap */
- for (i = 0; bmap && i < 32; i++) {
- bmap[i] = 0;
- }
-
- /* see if we're going to complement this class */
- if (*text == '^') {
- text++;
- complement = 1;
- }
-
- /* add in the characters */
- while (*text && *text != ']') {
- /* is this a span of characters? */
- if (text[1] == '-' && text[2]) {
- /* spans can't be backwards */
- if (text[0] > text[2]) {
- FAIL("Backwards span in []");
- }
-
- /* add each character in the span to the bitmap */
- for (i = text[0]; bmap && i <= text[2]; i++) {
- bmap[i >> 3] |= (1 << (i & 7));
- }
-
- /* move past this span */
- text += 3;
- } else {
- /* add this single character to the span */
- i = *text++;
- if (bmap) {
- bmap[i >> 3] |= (1 << (i & 7));
- }
- }
- }
-
- /* make sure the closing ] is missing */
- if (*text++ != ']') {
- FAIL("] missing");
- }
-
- /* if we're supposed to complement this class, then do so */
- if (complement && bmap) {
- for (i = 0; i < 32; i++) {
- bmap[i] = ~bmap[i];
- }
- }
-
- return text;
-}
-
-
-
-
-/* This function gets the next character or meta character from a string.
- * The pointer is incremented by 1, or by 2 for \-quoted characters. For [],
- * a bitmap is generated via makeclass() (if re is given), and the
- * character-class text is skipped.
- */
-static int gettoken(sptr, re)
-char **sptr;
-regexp *re;
-{
- int c;
-
- c = **sptr;
- ++*sptr;
- if (c == '\\') {
- c = **sptr;
- ++*sptr;
- switch (c) {
- case '<':
- return M_BEGWORD;
-
- case '>':
- return M_ENDWORD;
-
- case '(':
- if (start_cnt >= NSUBEXP) {
- FAIL("Too many \\(s");
- }
- end_stk[end_sp++] = start_cnt;
- return M_START(start_cnt++);
-
- case ')':
- if (end_sp <= 0) {
- FAIL("Mismatched \\)");
- }
- return M_END(end_stk[--end_sp]);
-
- case '*':
- return M_SPLAT;
-
- case '.':
- return M_ANY;
-
- case '+':
- return M_PLUS;
-
- case '?':
- return M_QMARK;
-
- default:
- return c;
- }
- } else {
- switch (c) {
- case '^':
- if (*sptr == retext + 1) {
- return M_BEGLINE;
- }
- return c;
-
- case '$':
- if (!**sptr) {
- return M_ENDLINE;
- }
- return c;
-
- case '.':
- return M_ANY;
-
- case '*':
- return M_SPLAT;
-
- case '[':
- /* make sure we don't have too many classes */
- if (class_cnt >= 10) {
- FAIL("Too many []s");
- }
-
- /* process the character list for this class */
- if (re) {
- /* generate the bitmap for this class */
- *sptr = makeclass(*sptr, re->program + 1 + 32 * class_cnt);
- } else {
- /* skip to end of the class */
- *sptr = makeclass(*sptr, (char *) 0);
- }
- return M_CLASS(class_cnt++);
-
- default:
- return c;
- }
- }
- /*NOTREACHED*/}
-
-
-
-
-/* This function calculates the number of bytes that will be needed for a
- * compiled RE. Its argument is the uncompiled version. It is not clever
- * about catching syntax errors; that is done in a later pass.
- */
-static unsigned calcsize(text)
-char *text;
-{
- unsigned size;
- int token;
-
- retext = text;
- class_cnt = 0;
- start_cnt = 1;
- end_sp = 0;
- size = 5;
- while ((token = gettoken(&text, (regexp *) 0)) != 0) {
- if (IS_CLASS(token)) {
- size += 34;
- } else if (IS_META(token)) {
- size += 2;
- } else {
- size++;
- }
- }
-
- return size;
-}
-
-
-
-/*---------------------------------------------------------------------------*/
-
-
-/* This function checks for a match between a character and a token which is
- * known to represent a single character. It returns 0 if they match, or
- * 1 if they don't.
- */
-static int match1(regexp * re, char ch, int token, int ignoreCase)
-{
- if (!ch) {
- /* the end of a line can't match any RE of width 1 */
- return 1;
- }
- if (token == M_ANY) {
- return 0;
- } else if (IS_CLASS(token)) {
- if (re->program[1 + 32 * (token - M_CLASS(0)) + (ch >> 3)] & (1 << (ch & 7)))
- return 0;
- }
-//fprintf(stderr, "match1: ch='%c' token='%c': ", ch, token);
- if (ch == token || (ignoreCase == TRUE && tolower(ch) == tolower(token))) {
-//fprintf(stderr, "match\n");
- return 0;
- }
-//fprintf(stderr, "no match\n");
- return 1;
-}
-
-
-
-/* This function checks characters up to and including the next closure, at
- * which point it does a recursive call to check the rest of it. This function
- * returns 0 if everything matches, or 1 if something doesn't match.
- */
-/* re -- the regular expression */
-/* str -- the string */
-/* prog -- a portion of re->program, an compiled RE */
-/* here -- a portion of str, the string to compare it to */
-static int match(regexp * re, char *str, char *prog, char *here,
- int ignoreCase)
-{
- int token;
- int nmatched;
- int closure;
-
- for (token = GET_META(prog); !IS_CLOSURE(token);
- prog++, token = GET_META(prog)) {
- switch (token) {
- /*case M_BEGLINE: can't happen; re->bol is used instead */
- case M_ENDLINE:
- if (*here)
- return 1;
- break;
-
- case M_BEGWORD:
- if (here != str &&
- (here[-1] == '_' ||
- (isascii(here[-1]) && isalnum(here[-1])))) return 1;
- break;
-
- case M_ENDWORD:
- if ((here[0] == '_' || isascii(here[0])) && isalnum(here[0]))
- return 1;
- break;
-
- case M_START(0):
- case M_START(1):
- case M_START(2):
- case M_START(3):
- case M_START(4):
- case M_START(5):
- case M_START(6):
- case M_START(7):
- case M_START(8):
- case M_START(9):
- re->startp[token - M_START(0)] = (char *) here;
- break;
-
- case M_END(0):
- case M_END(1):
- case M_END(2):
- case M_END(3):
- case M_END(4):
- case M_END(5):
- case M_END(6):
- case M_END(7):
- case M_END(8):
- case M_END(9):
- re->endp[token - M_END(0)] = (char *) here;
- if (token == M_END(0)) {
- return 0;
- }
- break;
-
- default: /* literal, M_CLASS(n), or M_ANY */
- if (match1(re, *here, token, ignoreCase) != 0)
- return 1;
- here++;
- }
- }
-
- /* C L O S U R E */
-
- /* step 1: see what we have to match against, and move "prog" to point
- * the the remainder of the compiled RE.
- */
- closure = token;
- prog++, token = GET_META(prog);
- prog++;
-
- /* step 2: see how many times we can match that token against the string */
- for (nmatched = 0;
- (closure != M_QMARK || nmatched < 1) && *here
- && match1(re, *here, token, ignoreCase) == 0; nmatched++, here++) {
- }
-
- /* step 3: try to match the remainder, and back off if it doesn't */
- while (nmatched >= 0 && match(re, str, prog, here, ignoreCase) != 0) {
- nmatched--;
- here--;
- }
-
- /* so how did it work out? */
- if (nmatched >= ((closure == M_PLUS) ? 1 : 0))
- return 0;
- return 1;
-}
-
-
-/* This function compiles a regexp. */
-static regexp *regcomp(char *text)
-{
- int needfirst;
- unsigned size;
- int token;
- int peek;
- char *build;
- regexp *re;
-
-
- /* prepare for error handling */
- re = (regexp *) 0;
- if (setjmp(errorhandler)) {
- if (re) {
- free(re);
- }
- return (regexp *) 0;
- }
-
- /* if an empty regexp string was given, use the previous one */
- if (*text == 0) {
- if (!previous) {
- FAIL("No previous RE");
- }
- text = previous;
- } else { /* non-empty regexp given, so remember it */
-
- if (previous)
- free(previous);
- previous = (char *) malloc((unsigned) (strlen(text) + 1));
- if (previous)
- strcpy(previous, text);
- }
-
- /* allocate memory */
- class_cnt = 0;
- start_cnt = 1;
- end_sp = 0;
- retext = text;
- size = calcsize(text) + sizeof(regexp);
- re = (regexp *) malloc((unsigned) size);
-
- if (!re) {
- FAIL("Not enough memory for this RE");
- }
-
- /* compile it */
- build = &re->program[1 + 32 * class_cnt];
- re->program[0] = class_cnt;
- for (token = 0; token < NSUBEXP; token++) {
- re->startp[token] = re->endp[token] = (char *) 0;
- }
- re->first = 0;
- re->bol = 0;
- re->minlen = 0;
- needfirst = 1;
- class_cnt = 0;
- start_cnt = 1;
- end_sp = 0;
- retext = text;
- for (token = M_START(0), peek = gettoken(&text, re);
- token; token = peek, peek = gettoken(&text, re)) {
-
- /* special processing for the closure operator */
- if (IS_CLOSURE(peek)) {
-
- /* detect misuse of closure operator */
- if (IS_START(token)) {
- FAIL("* or \\+ or \\? follows nothing");
- } else if (IS_META(token) && token != M_ANY && !IS_CLASS(token)) {
- FAIL("* or \\+ or \\? can only follow a normal character or . or []");
- }
-
- /* it is okay -- make it prefix instead of postfix */
- ADD_META(build, peek);
-
- /* take care of "needfirst" - is this the first char? */
- if (needfirst && peek == M_PLUS && !IS_META(token)) {
- re->first = token;
- }
- needfirst = 0;
-
- /* we used "peek" -- need to refill it */
- peek = gettoken(&text, re);
- if (IS_CLOSURE(peek)) {
- FAIL("* or \\+ or \\? doubled up");
- }
- } else if (!IS_META(token)) {
- /* normal char is NOT argument of closure */
- if (needfirst) {
- re->first = token;
- needfirst = 0;
- }
- re->minlen++;
- } else if (token == M_ANY || IS_CLASS(token)) {
- /* . or [] is NOT argument of closure */
- needfirst = 0;
- re->minlen++;
- }
-
- /* the "token" character is not closure -- process it normally */
- if (token == M_BEGLINE) {
- /* set the BOL flag instead of storing M_BEGLINE */
- re->bol = 1;
- } else if (IS_META(token)) {
- ADD_META(build, token);
- } else {
- *build++ = token;
- }
- }
-
- /* end it with a \) which MUST MATCH the opening \( */
- ADD_META(build, M_END(0));
- if (end_sp > 0) {
- FAIL("Not enough \\)s");
- }
-
- return re;
-}
-
-
-
-
-/* This function searches through a string for text that matches an RE. */
-/* re -- the compiled regexp to search for */
-/* str -- the string to search through */
-/* bol -- does str start at the beginning of a line? (boolean) */
-/* ignoreCase -- ignoreCase or not */
-static int regexec(struct regexp *re, char *str, int bol, int ignoreCase)
-{
- char *prog; /* the entry point of re->program */
- int len; /* length of the string */
- char *here;
-
- /* if must start at the beginning of a line, and this isn't, then fail */
- if (re->bol && bol == TRUE) {
- return FALSE;
- }
-
- len = strlen(str);
- prog = re->program + 1 + 32 * re->program[0];
-
- /* search for the RE in the string */
- if (re->bol) {
- /* must occur at BOL */
- if ((re->first && match1(re, *(char *) str, re->first, ignoreCase)) /* wrong first letter? */
- ||len < re->minlen /* not long enough? */
- || match(re, (char *) str, prog, str, ignoreCase)) /* doesn't match? */
- return FALSE; /* THEN FAIL! */
- } else if (ignoreCase == FALSE) {
- /* can occur anywhere in the line, noignorecase */
- for (here = (char *) str; (re->first && re->first != *here)
- || match(re, (char *) str, prog, here, ignoreCase);
- here++, len--) {
- if (len < re->minlen)
- return FALSE;
- }
- } else {
- /* can occur anywhere in the line, ignorecase */
- for (here = (char *) str;
- (re->first && match1(re, *here, (int) re->first, ignoreCase))
- || match(re, (char *) str, prog, here, ignoreCase);
- here++, len--) {
- if (len < re->minlen)
- return FALSE;
- }
- }
-
- /* if we didn't fail, then we must have succeeded */
- return TRUE;
-}
-
-
-
-
-#if defined BB_SED
-/* This performs substitutions after a regexp match has been found. */
-static void regsub(regexp * re, char *src, char *dst)
-{
- char *cpy;
- char *end;
- char c;
- char *start;
- int mod;
-
- mod = 0;
-
- start = src;
- while ((c = *src++) != '\0') {
- /* recognize any meta characters */
- if (c == '&') {
- cpy = re->startp[0];
- end = re->endp[0];
- } else if (c == '~') {
- cpy = previous1;
- if (cpy)
- end = cpy + strlen(cpy);
- } else if (c == '\\') {
- c = *src++;
- switch (c) {
- case '0':
- case '1':
- case '2':
- case '3':
- case '4':
- case '5':
- case '6':
- case '7':
- case '8':
- case '9':
- /* \0 thru \9 mean "copy subexpression" */
- c -= '0';
- cpy = re->startp[(int) c];
- end = re->endp[(int) c];
- break;
- case 'U':
- case 'u':
- case 'L':
- case 'l':
- /* \U and \L mean "convert to upper/lowercase" */
- mod = c;
- continue;
-
- case 'E':
- case 'e':
- /* \E ends the \U or \L */
- mod = 0;
- continue;
- case '&':
- /* "\&" means "original text" */
- *dst++ = c;
- continue;
-
- case '~':
- /* "\~" means "previous text, if any" */
- *dst++ = c;
- continue;
- default:
- /* ordinary char preceded by backslash */
- *dst++ = c;
- continue;
- }
- } else {
- /* ordinary character, so just copy it */
- *dst++ = c;
- continue;
- }
-
- /* Note: to reach this point in the code, we must have evaded
- * all "continue" statements. To do that, we must have hit
- * a metacharacter that involves copying.
- */
-
- /* if there is nothing to copy, loop */
- if (!cpy)
- continue;
-
- /* copy over a portion of the original */
- while (cpy < end) {
- switch (mod) {
- case 'U':
- case 'u':
- /* convert to uppercase */
- if (isascii(*cpy) && islower(*cpy)) {
- *dst++ = toupper(*cpy);
- cpy++;
- } else {
- *dst++ = *cpy++;
- }
- break;
-
- case 'L':
- case 'l':
- /* convert to lowercase */
- if (isascii(*cpy) && isupper(*cpy)) {
- *dst++ = tolower(*cpy);
- cpy++;
- } else {
- *dst++ = *cpy++;
- }
- break;
-
- default:
- /* copy without any conversion */
- *dst++ = *cpy++;
- }
-
- /* \u and \l end automatically after the first char */
- if (mod && (mod == 'u' || mod == 'l')) {
- mod = 0;
- }
- }
- }
- *dst = '\0';
-
- /* remember what text we inserted this time */
- if (previous1)
- free(previous1);
- previous1 = (char *) malloc((unsigned) (strlen(start) + 1));
- if (previous1)
- strcpy(previous1, start);
-}
-#endif
-
-#endif /* BB_REGEXP */