| /* vi: set sw=4 ts=4: */ |
| /* regexp.c */ |
| |
| #include "internal.h" |
| #include "regexp.h" |
| #include <setjmp.h> |
| #include <stdio.h> |
| #include <ctype.h> |
| |
| |
| #if ( defined BB_GREP || defined BB_SED) |
| |
| /* This also tries to find a needle in a haystack, but uses |
| * real regular expressions.... The fake regular expression |
| * version of find_match lives in utility.c. Using this version |
| * will add 3.9k to busybox... |
| * -Erik Andersen |
| */ |
| extern int find_match(char *haystack, char *needle, int ignoreCase) |
| { |
| int status; |
| struct regexp *re; |
| |
| re = regcomp(needle); |
| status = regexec(re, haystack, FALSE, ignoreCase); |
| free(re); |
| return (status); |
| } |
| |
| #if defined BB_SED |
| /* This performs substitutions after a regexp match has been found. |
| * The new string is returned. It is malloc'ed, and do must be freed. */ |
| extern int replace_match(char *haystack, char *needle, char *newNeedle, |
| int ignoreCase) |
| { |
| int status; |
| struct regexp *re; |
| char *s, buf[BUF_SIZE], *d = buf; |
| |
| re = regcomp(needle); |
| status = regexec(re, haystack, FALSE, ignoreCase); |
| if (status == TRUE) { |
| s = haystack; |
| |
| do { |
| /* copy stuff from before the match */ |
| while (s < re->startp[0]) |
| *d++ = *s++; |
| /* substitute for the matched part */ |
| regsub(re, newNeedle, d); |
| s = re->endp[0]; |
| d += strlen(d); |
| } while (regexec(re, s, FALSE, ignoreCase) == TRUE); |
| /* copy stuff from after the match */ |
| while ((*d++ = *s++)) { |
| } |
| d[0] = '\0'; |
| strcpy(haystack, buf); |
| } |
| free(re); |
| return (status); |
| } |
| #endif |
| |
| |
| /* code swiped from elvis-tiny 1.4 (a clone of vi) and adjusted to |
| * suit the needs of busybox by Erik Andersen. |
| * |
| * From the README: |
| * "Elvis is freely redistributable, in either source form or executable form. |
| * There are no restrictions on how you may use it". |
| * Elvis was written by Steve Kirkendall <kirkenda@cs.pdx.edu> |
| * |
| * |
| * This file contains the code that compiles regular expressions and executes |
| * them. It supports the same syntax and features as vi's regular expression |
| * code. Specifically, the meta characters are: |
| * ^ matches the beginning of a line |
| * $ matches the end of a line |
| * \< matches the beginning of a word |
| * \> matches the end of a word |
| * . matches any single character |
| * [] matches any character in a character class |
| * \( delimits the start of a subexpression |
| * \) delimits the end of a subexpression |
| * * repeats the preceding 0 or more times |
| * NOTE: You cannot follow a \) with a *. |
| * |
| * The physical structure of a compiled RE is as follows: |
| * - First, there is a one-byte value that says how many character classes |
| * are used in this regular expression |
| * - Next, each character class is stored as a bitmap that is 256 bits |
| * (32 bytes) long. |
| * - A mixture of literal characters and compiled meta characters follows. |
| * This begins with M_BEGIN(0) and ends with M_END(0). All meta chars |
| * are stored as a \n followed by a one-byte code, so they take up two |
| * bytes apiece. Literal characters take up one byte apiece. \n can't |
| * be used as a literal character. |
| * |
| */ |
| |
| |
| |
| static char *previous; /* the previous regexp, used when null regexp is given */ |
| |
| #if defined BB_SED |
| static char *previous1; /* a copy of the text from the previous substitution for regsub() */ |
| #endif |
| |
| |
| /* These are used to classify or recognize meta-characters */ |
| #define META '\0' |
| #define BASE_META(m) ((m) - 256) |
| #define INT_META(c) ((c) + 256) |
| #define IS_META(m) ((m) >= 256) |
| #define IS_CLASS(m) ((m) >= M_CLASS(0) && (m) <= M_CLASS(9)) |
| #define IS_START(m) ((m) >= M_START(0) && (m) <= M_START(9)) |
| #define IS_END(m) ((m) >= M_END(0) && (m) <= M_END(9)) |
| #define IS_CLOSURE(m) ((m) >= M_SPLAT && (m) <= M_QMARK) |
| #define ADD_META(s,m) (*(s)++ = META, *(s)++ = BASE_META(m)) |
| #define GET_META(s) (*(s) == META ? INT_META(*++(s)) : *s) |
| |
| /* These are the internal codes used for each type of meta-character */ |
| #define M_BEGLINE 256 /* internal code for ^ */ |
| #define M_ENDLINE 257 /* internal code for $ */ |
| #define M_BEGWORD 258 /* internal code for \< */ |
| #define M_ENDWORD 259 /* internal code for \> */ |
| #define M_ANY 260 /* internal code for . */ |
| #define M_SPLAT 261 /* internal code for * */ |
| #define M_PLUS 262 /* internal code for \+ */ |
| #define M_QMARK 263 /* internal code for \? */ |
| #define M_CLASS(n) (264+(n)) /* internal code for [] */ |
| #define M_START(n) (274+(n)) /* internal code for \( */ |
| #define M_END(n) (284+(n)) /* internal code for \) */ |
| |
| /* These are used during compilation */ |
| static int class_cnt; /* used to assign class IDs */ |
| static int start_cnt; /* used to assign start IDs */ |
| static int end_stk[NSUBEXP]; /* used to assign end IDs */ |
| static int end_sp; |
| static char *retext; /* points to the text being compiled */ |
| |
| /* error-handling stuff */ |
| jmp_buf errorhandler; |
| |
| #define FAIL(why) do {fprintf(stderr, why); longjmp(errorhandler, 1);} while (0) |
| |
| |
| |
| |
| /* This function builds a bitmap for a particular class */ |
| /* text -- start of the class */ |
| /* bmap -- the bitmap */ |
| static char *makeclass(char *text, char *bmap) |
| { |
| int i; |
| int complement = 0; |
| |
| |
| /* zero the bitmap */ |
| for (i = 0; bmap && i < 32; i++) { |
| bmap[i] = 0; |
| } |
| |
| /* see if we're going to complement this class */ |
| if (*text == '^') { |
| text++; |
| complement = 1; |
| } |
| |
| /* add in the characters */ |
| while (*text && *text != ']') { |
| /* is this a span of characters? */ |
| if (text[1] == '-' && text[2]) { |
| /* spans can't be backwards */ |
| if (text[0] > text[2]) { |
| FAIL("Backwards span in []"); |
| } |
| |
| /* add each character in the span to the bitmap */ |
| for (i = text[0]; bmap && i <= text[2]; i++) { |
| bmap[i >> 3] |= (1 << (i & 7)); |
| } |
| |
| /* move past this span */ |
| text += 3; |
| } else { |
| /* add this single character to the span */ |
| i = *text++; |
| if (bmap) { |
| bmap[i >> 3] |= (1 << (i & 7)); |
| } |
| } |
| } |
| |
| /* make sure the closing ] is missing */ |
| if (*text++ != ']') { |
| FAIL("] missing"); |
| } |
| |
| /* if we're supposed to complement this class, then do so */ |
| if (complement && bmap) { |
| for (i = 0; i < 32; i++) { |
| bmap[i] = ~bmap[i]; |
| } |
| } |
| |
| return text; |
| } |
| |
| |
| |
| |
| /* This function gets the next character or meta character from a string. |
| * The pointer is incremented by 1, or by 2 for \-quoted characters. For [], |
| * a bitmap is generated via makeclass() (if re is given), and the |
| * character-class text is skipped. |
| */ |
| static int gettoken(sptr, re) |
| char **sptr; |
| regexp *re; |
| { |
| int c; |
| |
| c = **sptr; |
| ++*sptr; |
| if (c == '\\') { |
| c = **sptr; |
| ++*sptr; |
| switch (c) { |
| case '<': |
| return M_BEGWORD; |
| |
| case '>': |
| return M_ENDWORD; |
| |
| case '(': |
| if (start_cnt >= NSUBEXP) { |
| FAIL("Too many \\(s"); |
| } |
| end_stk[end_sp++] = start_cnt; |
| return M_START(start_cnt++); |
| |
| case ')': |
| if (end_sp <= 0) { |
| FAIL("Mismatched \\)"); |
| } |
| return M_END(end_stk[--end_sp]); |
| |
| case '*': |
| return M_SPLAT; |
| |
| case '.': |
| return M_ANY; |
| |
| case '+': |
| return M_PLUS; |
| |
| case '?': |
| return M_QMARK; |
| |
| default: |
| return c; |
| } |
| } else { |
| switch (c) { |
| case '^': |
| if (*sptr == retext + 1) { |
| return M_BEGLINE; |
| } |
| return c; |
| |
| case '$': |
| if (!**sptr) { |
| return M_ENDLINE; |
| } |
| return c; |
| |
| case '.': |
| return M_ANY; |
| |
| case '*': |
| return M_SPLAT; |
| |
| case '[': |
| /* make sure we don't have too many classes */ |
| if (class_cnt >= 10) { |
| FAIL("Too many []s"); |
| } |
| |
| /* process the character list for this class */ |
| if (re) { |
| /* generate the bitmap for this class */ |
| *sptr = makeclass(*sptr, re->program + 1 + 32 * class_cnt); |
| } else { |
| /* skip to end of the class */ |
| *sptr = makeclass(*sptr, (char *) 0); |
| } |
| return M_CLASS(class_cnt++); |
| |
| default: |
| return c; |
| } |
| } |
| /*NOTREACHED*/} |
| |
| |
| |
| |
| /* This function calculates the number of bytes that will be needed for a |
| * compiled RE. Its argument is the uncompiled version. It is not clever |
| * about catching syntax errors; that is done in a later pass. |
| */ |
| static unsigned calcsize(text) |
| char *text; |
| { |
| unsigned size; |
| int token; |
| |
| retext = text; |
| class_cnt = 0; |
| start_cnt = 1; |
| end_sp = 0; |
| size = 5; |
| while ((token = gettoken(&text, (regexp *) 0)) != 0) { |
| if (IS_CLASS(token)) { |
| size += 34; |
| } else if (IS_META(token)) { |
| size += 2; |
| } else { |
| size++; |
| } |
| } |
| |
| return size; |
| } |
| |
| |
| |
| /*---------------------------------------------------------------------------*/ |
| |
| |
| /* This function checks for a match between a character and a token which is |
| * known to represent a single character. It returns 0 if they match, or |
| * 1 if they don't. |
| */ |
| static int match1(regexp * re, char ch, int token, int ignoreCase) |
| { |
| if (!ch) { |
| /* the end of a line can't match any RE of width 1 */ |
| return 1; |
| } |
| if (token == M_ANY) { |
| return 0; |
| } else if (IS_CLASS(token)) { |
| if (re->program[1 + 32 * (token - M_CLASS(0)) + (ch >> 3)] & (1 << (ch & 7))) |
| return 0; |
| } |
| //fprintf(stderr, "match1: ch='%c' token='%c': ", ch, token); |
| if (ch == token || (ignoreCase == TRUE && tolower(ch) == tolower(token))) { |
| //fprintf(stderr, "match\n"); |
| return 0; |
| } |
| //fprintf(stderr, "no match\n"); |
| return 1; |
| } |
| |
| |
| |
| /* This function checks characters up to and including the next closure, at |
| * which point it does a recursive call to check the rest of it. This function |
| * returns 0 if everything matches, or 1 if something doesn't match. |
| */ |
| /* re -- the regular expression */ |
| /* str -- the string */ |
| /* prog -- a portion of re->program, an compiled RE */ |
| /* here -- a portion of str, the string to compare it to */ |
| static int match(regexp * re, char *str, char *prog, char *here, |
| int ignoreCase) |
| { |
| int token; |
| int nmatched; |
| int closure; |
| |
| for (token = GET_META(prog); !IS_CLOSURE(token); |
| prog++, token = GET_META(prog)) { |
| switch (token) { |
| /*case M_BEGLINE: can't happen; re->bol is used instead */ |
| case M_ENDLINE: |
| if (*here) |
| return 1; |
| break; |
| |
| case M_BEGWORD: |
| if (here != str && |
| (here[-1] == '_' || |
| (isascii(here[-1]) && isalnum(here[-1])))) return 1; |
| break; |
| |
| case M_ENDWORD: |
| if ((here[0] == '_' || isascii(here[0])) && isalnum(here[0])) |
| return 1; |
| break; |
| |
| case M_START(0): |
| case M_START(1): |
| case M_START(2): |
| case M_START(3): |
| case M_START(4): |
| case M_START(5): |
| case M_START(6): |
| case M_START(7): |
| case M_START(8): |
| case M_START(9): |
| re->startp[token - M_START(0)] = (char *) here; |
| break; |
| |
| case M_END(0): |
| case M_END(1): |
| case M_END(2): |
| case M_END(3): |
| case M_END(4): |
| case M_END(5): |
| case M_END(6): |
| case M_END(7): |
| case M_END(8): |
| case M_END(9): |
| re->endp[token - M_END(0)] = (char *) here; |
| if (token == M_END(0)) { |
| return 0; |
| } |
| break; |
| |
| default: /* literal, M_CLASS(n), or M_ANY */ |
| if (match1(re, *here, token, ignoreCase) != 0) |
| return 1; |
| here++; |
| } |
| } |
| |
| /* C L O S U R E */ |
| |
| /* step 1: see what we have to match against, and move "prog" to point |
| * the the remainder of the compiled RE. |
| */ |
| closure = token; |
| prog++, token = GET_META(prog); |
| prog++; |
| |
| /* step 2: see how many times we can match that token against the string */ |
| for (nmatched = 0; |
| (closure != M_QMARK || nmatched < 1) && *here |
| && match1(re, *here, token, ignoreCase) == 0; nmatched++, here++) { |
| } |
| |
| /* step 3: try to match the remainder, and back off if it doesn't */ |
| while (nmatched >= 0 && match(re, str, prog, here, ignoreCase) != 0) { |
| nmatched--; |
| here--; |
| } |
| |
| /* so how did it work out? */ |
| if (nmatched >= ((closure == M_PLUS) ? 1 : 0)) |
| return 0; |
| return 1; |
| } |
| |
| |
| /* This function compiles a regexp. */ |
| extern regexp *regcomp(char *text) |
| { |
| int needfirst; |
| unsigned size; |
| int token; |
| int peek; |
| char *build; |
| regexp *re; // Ignore compiler whining. If we longjmp, we don't use re anymore. |
| |
| |
| /* prepare for error handling */ |
| re = (regexp *) 0; |
| if (setjmp(errorhandler)) { |
| if (re) { |
| free(re); |
| } |
| return (regexp *) 0; |
| } |
| |
| /* if an empty regexp string was given, use the previous one */ |
| if (*text == 0) { |
| if (!previous) { |
| FAIL("No previous RE"); |
| } |
| text = previous; |
| } else { /* non-empty regexp given, so remember it */ |
| |
| if (previous) |
| free(previous); |
| previous = (char *) malloc((unsigned) (strlen(text) + 1)); |
| if (previous) |
| strcpy(previous, text); |
| } |
| |
| /* allocate memory */ |
| class_cnt = 0; |
| start_cnt = 1; |
| end_sp = 0; |
| retext = text; |
| size = calcsize(text) + sizeof(regexp); |
| re = (regexp *) malloc((unsigned) size); |
| |
| if (!re) { |
| FAIL("Not enough memory for this RE"); |
| } |
| |
| /* compile it */ |
| build = &re->program[1 + 32 * class_cnt]; |
| re->program[0] = class_cnt; |
| for (token = 0; token < NSUBEXP; token++) { |
| re->startp[token] = re->endp[token] = (char *) 0; |
| } |
| re->first = 0; |
| re->bol = 0; |
| re->minlen = 0; |
| needfirst = 1; |
| class_cnt = 0; |
| start_cnt = 1; |
| end_sp = 0; |
| retext = text; |
| for (token = M_START(0), peek = gettoken(&text, re); |
| token; token = peek, peek = gettoken(&text, re)) { |
| |
| /* special processing for the closure operator */ |
| if (IS_CLOSURE(peek)) { |
| |
| /* detect misuse of closure operator */ |
| if (IS_START(token)) { |
| FAIL("* or \\+ or \\? follows nothing"); |
| } else if (IS_META(token) && token != M_ANY && !IS_CLASS(token)) { |
| FAIL("* or \\+ or \\? can only follow a normal character or . or []"); |
| } |
| |
| /* it is okay -- make it prefix instead of postfix */ |
| ADD_META(build, peek); |
| |
| /* take care of "needfirst" - is this the first char? */ |
| if (needfirst && peek == M_PLUS && !IS_META(token)) { |
| re->first = token; |
| } |
| needfirst = 0; |
| |
| /* we used "peek" -- need to refill it */ |
| peek = gettoken(&text, re); |
| if (IS_CLOSURE(peek)) { |
| FAIL("* or \\+ or \\? doubled up"); |
| } |
| } else if (!IS_META(token)) { |
| /* normal char is NOT argument of closure */ |
| if (needfirst) { |
| re->first = token; |
| needfirst = 0; |
| } |
| re->minlen++; |
| } else if (token == M_ANY || IS_CLASS(token)) { |
| /* . or [] is NOT argument of closure */ |
| needfirst = 0; |
| re->minlen++; |
| } |
| |
| /* the "token" character is not closure -- process it normally */ |
| if (token == M_BEGLINE) { |
| /* set the BOL flag instead of storing M_BEGLINE */ |
| re->bol = 1; |
| } else if (IS_META(token)) { |
| ADD_META(build, token); |
| } else { |
| *build++ = token; |
| } |
| } |
| |
| /* end it with a \) which MUST MATCH the opening \( */ |
| ADD_META(build, M_END(0)); |
| if (end_sp > 0) { |
| FAIL("Not enough \\)s"); |
| } |
| |
| return re; |
| } |
| |
| |
| |
| |
| /* This function searches through a string for text that matches an RE. */ |
| /* re -- the compiled regexp to search for */ |
| /* str -- the string to search through */ |
| /* bol -- does str start at the beginning of a line? (boolean) */ |
| /* ignoreCase -- ignoreCase or not */ |
| extern int regexec(struct regexp *re, char *str, int bol, int ignoreCase) |
| { |
| char *prog; /* the entry point of re->program */ |
| int len; /* length of the string */ |
| char *here; |
| |
| /* if must start at the beginning of a line, and this isn't, then fail */ |
| if (re->bol && bol == TRUE) { |
| return FALSE; |
| } |
| |
| len = strlen(str); |
| prog = re->program + 1 + 32 * re->program[0]; |
| |
| /* search for the RE in the string */ |
| if (re->bol) { |
| /* must occur at BOL */ |
| if ((re->first && match1(re, *(char *) str, re->first, ignoreCase)) /* wrong first letter? */ |
| ||len < re->minlen /* not long enough? */ |
| || match(re, (char *) str, prog, str, ignoreCase)) /* doesn't match? */ |
| return FALSE; /* THEN FAIL! */ |
| } else if (ignoreCase == FALSE) { |
| /* can occur anywhere in the line, noignorecase */ |
| for (here = (char *) str; (re->first && re->first != *here) |
| || match(re, (char *) str, prog, here, ignoreCase); |
| here++, len--) { |
| if (len < re->minlen) |
| return FALSE; |
| } |
| } else { |
| /* can occur anywhere in the line, ignorecase */ |
| for (here = (char *) str; |
| (re->first && match1(re, *here, (int) re->first, ignoreCase)) |
| || match(re, (char *) str, prog, here, ignoreCase); |
| here++, len--) { |
| if (len < re->minlen) |
| return FALSE; |
| } |
| } |
| |
| /* if we didn't fail, then we must have succeeded */ |
| return TRUE; |
| } |
| |
| |
| |
| |
| #if defined BB_SED |
| /* This performs substitutions after a regexp match has been found. */ |
| extern void regsub(regexp * re, char *src, char *dst) |
| { |
| char *cpy; |
| char *end; |
| char c; |
| char *start; |
| int mod; |
| |
| mod = 0; |
| |
| start = src; |
| while ((c = *src++) != '\0') { |
| /* recognize any meta characters */ |
| if (c == '&') { |
| cpy = re->startp[0]; |
| end = re->endp[0]; |
| } else if (c == '~') { |
| cpy = previous1; |
| if (cpy) |
| end = cpy + strlen(cpy); |
| } else if (c == '\\') { |
| c = *src++; |
| switch (c) { |
| case '0': |
| case '1': |
| case '2': |
| case '3': |
| case '4': |
| case '5': |
| case '6': |
| case '7': |
| case '8': |
| case '9': |
| /* \0 thru \9 mean "copy subexpression" */ |
| c -= '0'; |
| cpy = re->startp[(int) c]; |
| end = re->endp[(int) c]; |
| break; |
| case 'U': |
| case 'u': |
| case 'L': |
| case 'l': |
| /* \U and \L mean "convert to upper/lowercase" */ |
| mod = c; |
| continue; |
| |
| case 'E': |
| case 'e': |
| /* \E ends the \U or \L */ |
| mod = 0; |
| continue; |
| case '&': |
| /* "\&" means "original text" */ |
| *dst++ = c; |
| continue; |
| |
| case '~': |
| /* "\~" means "previous text, if any" */ |
| *dst++ = c; |
| continue; |
| default: |
| /* ordinary char preceded by backslash */ |
| *dst++ = c; |
| continue; |
| } |
| } else { |
| /* ordinary character, so just copy it */ |
| *dst++ = c; |
| continue; |
| } |
| |
| /* Note: to reach this point in the code, we must have evaded |
| * all "continue" statements. To do that, we must have hit |
| * a metacharacter that involves copying. |
| */ |
| |
| /* if there is nothing to copy, loop */ |
| if (!cpy) |
| continue; |
| |
| /* copy over a portion of the original */ |
| while (cpy < end) { |
| switch (mod) { |
| case 'U': |
| case 'u': |
| /* convert to uppercase */ |
| if (isascii(*cpy) && islower(*cpy)) { |
| *dst++ = toupper(*cpy); |
| cpy++; |
| } else { |
| *dst++ = *cpy++; |
| } |
| break; |
| |
| case 'L': |
| case 'l': |
| /* convert to lowercase */ |
| if (isascii(*cpy) && isupper(*cpy)) { |
| *dst++ = tolower(*cpy); |
| cpy++; |
| } else { |
| *dst++ = *cpy++; |
| } |
| break; |
| |
| default: |
| /* copy without any conversion */ |
| *dst++ = *cpy++; |
| } |
| |
| /* \u and \l end automatically after the first char */ |
| if (mod && (mod == 'u' || mod == 'l')) { |
| mod = 0; |
| } |
| } |
| } |
| *dst = '\0'; |
| |
| /* remember what text we inserted this time */ |
| if (previous1) |
| free(previous1); |
| previous1 = (char *) malloc((unsigned) (strlen(start) + 1)); |
| if (previous1) |
| strcpy(previous1, start); |
| } |
| #endif |
| |
| #endif /* BB_REGEXP */ |