| /*- |
| * This code is derived from OpenBSD's libc/regex, original license follows: |
| * |
| * Copyright (c) 1992, 1993, 1994 Henry Spencer. |
| * Copyright (c) 1992, 1993, 1994 |
| * The Regents of the University of California. All rights reserved. |
| * |
| * This code is derived from software contributed to Berkeley by |
| * Henry Spencer. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions |
| * are met: |
| * 1. Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * 2. Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in the |
| * documentation and/or other materials provided with the distribution. |
| * 3. Neither the name of the University nor the names of its contributors |
| * may be used to endorse or promote products derived from this software |
| * without specific prior written permission. |
| * |
| * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND |
| * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
| * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE |
| * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
| * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
| * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
| * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
| * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
| * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
| * SUCH DAMAGE. |
| * |
| * @(#)regcomp.c 8.5 (Berkeley) 3/20/94 |
| */ |
| |
| #include <sys/types.h> |
| #include <stdint.h> |
| #include <stdio.h> |
| #include <string.h> |
| #include <ctype.h> |
| #include <limits.h> |
| #include <stdlib.h> |
| #include "regex_impl.h" |
| |
| #include "regutils.h" |
| #include "regex2.h" |
| |
| #include "llvm/Config/config.h" |
| #include "llvm/Support/Compiler.h" |
| |
| /* character-class table */ |
| static struct cclass { |
| const char *name; |
| const char *chars; |
| const char *multis; |
| } cclasses[] = { |
| { "alnum", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\ |
| 0123456789", ""} , |
| { "alpha", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz", |
| ""} , |
| { "blank", " \t", ""} , |
| { "cntrl", "\007\b\t\n\v\f\r\1\2\3\4\5\6\16\17\20\21\22\23\24\ |
| \25\26\27\30\31\32\33\34\35\36\37\177", ""} , |
| { "digit", "0123456789", ""} , |
| { "graph", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\ |
| 0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~", |
| ""} , |
| { "lower", "abcdefghijklmnopqrstuvwxyz", |
| ""} , |
| { "print", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\ |
| 0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ ", |
| ""} , |
| { "punct", "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~", |
| ""} , |
| { "space", "\t\n\v\f\r ", ""} , |
| { "upper", "ABCDEFGHIJKLMNOPQRSTUVWXYZ", |
| ""} , |
| { "xdigit", "0123456789ABCDEFabcdef", |
| ""} , |
| { NULL, 0, "" } |
| }; |
| |
| /* character-name table */ |
| static struct cname { |
| const char *name; |
| char code; |
| } cnames[] = { |
| { "NUL", '\0' }, |
| { "SOH", '\001' }, |
| { "STX", '\002' }, |
| { "ETX", '\003' }, |
| { "EOT", '\004' }, |
| { "ENQ", '\005' }, |
| { "ACK", '\006' }, |
| { "BEL", '\007' }, |
| { "alert", '\007' }, |
| { "BS", '\010' }, |
| { "backspace", '\b' }, |
| { "HT", '\011' }, |
| { "tab", '\t' }, |
| { "LF", '\012' }, |
| { "newline", '\n' }, |
| { "VT", '\013' }, |
| { "vertical-tab", '\v' }, |
| { "FF", '\014' }, |
| { "form-feed", '\f' }, |
| { "CR", '\015' }, |
| { "carriage-return", '\r' }, |
| { "SO", '\016' }, |
| { "SI", '\017' }, |
| { "DLE", '\020' }, |
| { "DC1", '\021' }, |
| { "DC2", '\022' }, |
| { "DC3", '\023' }, |
| { "DC4", '\024' }, |
| { "NAK", '\025' }, |
| { "SYN", '\026' }, |
| { "ETB", '\027' }, |
| { "CAN", '\030' }, |
| { "EM", '\031' }, |
| { "SUB", '\032' }, |
| { "ESC", '\033' }, |
| { "IS4", '\034' }, |
| { "FS", '\034' }, |
| { "IS3", '\035' }, |
| { "GS", '\035' }, |
| { "IS2", '\036' }, |
| { "RS", '\036' }, |
| { "IS1", '\037' }, |
| { "US", '\037' }, |
| { "space", ' ' }, |
| { "exclamation-mark", '!' }, |
| { "quotation-mark", '"' }, |
| { "number-sign", '#' }, |
| { "dollar-sign", '$' }, |
| { "percent-sign", '%' }, |
| { "ampersand", '&' }, |
| { "apostrophe", '\'' }, |
| { "left-parenthesis", '(' }, |
| { "right-parenthesis", ')' }, |
| { "asterisk", '*' }, |
| { "plus-sign", '+' }, |
| { "comma", ',' }, |
| { "hyphen", '-' }, |
| { "hyphen-minus", '-' }, |
| { "period", '.' }, |
| { "full-stop", '.' }, |
| { "slash", '/' }, |
| { "solidus", '/' }, |
| { "zero", '0' }, |
| { "one", '1' }, |
| { "two", '2' }, |
| { "three", '3' }, |
| { "four", '4' }, |
| { "five", '5' }, |
| { "six", '6' }, |
| { "seven", '7' }, |
| { "eight", '8' }, |
| { "nine", '9' }, |
| { "colon", ':' }, |
| { "semicolon", ';' }, |
| { "less-than-sign", '<' }, |
| { "equals-sign", '=' }, |
| { "greater-than-sign", '>' }, |
| { "question-mark", '?' }, |
| { "commercial-at", '@' }, |
| { "left-square-bracket", '[' }, |
| { "backslash", '\\' }, |
| { "reverse-solidus", '\\' }, |
| { "right-square-bracket", ']' }, |
| { "circumflex", '^' }, |
| { "circumflex-accent", '^' }, |
| { "underscore", '_' }, |
| { "low-line", '_' }, |
| { "grave-accent", '`' }, |
| { "left-brace", '{' }, |
| { "left-curly-bracket", '{' }, |
| { "vertical-line", '|' }, |
| { "right-brace", '}' }, |
| { "right-curly-bracket", '}' }, |
| { "tilde", '~' }, |
| { "DEL", '\177' }, |
| { NULL, 0 } |
| }; |
| |
| /* |
| * parse structure, passed up and down to avoid global variables and |
| * other clumsinesses |
| */ |
| struct parse { |
| char *next; /* next character in RE */ |
| char *end; /* end of string (-> NUL normally) */ |
| int error; /* has an error been seen? */ |
| sop *strip; /* malloced strip */ |
| sopno ssize; /* malloced strip size (allocated) */ |
| sopno slen; /* malloced strip length (used) */ |
| int ncsalloc; /* number of csets allocated */ |
| struct re_guts *g; |
| # define NPAREN 10 /* we need to remember () 1-9 for back refs */ |
| sopno pbegin[NPAREN]; /* -> ( ([0] unused) */ |
| sopno pend[NPAREN]; /* -> ) ([0] unused) */ |
| }; |
| |
| static void p_ere(struct parse *, int); |
| static void p_ere_exp(struct parse *); |
| static void p_str(struct parse *); |
| static void p_bre(struct parse *, int, int); |
| static int p_simp_re(struct parse *, int); |
| static int p_count(struct parse *); |
| static void p_bracket(struct parse *); |
| static void p_b_term(struct parse *, cset *); |
| static void p_b_cclass(struct parse *, cset *); |
| static void p_b_eclass(struct parse *, cset *); |
| static char p_b_symbol(struct parse *); |
| static char p_b_coll_elem(struct parse *, int); |
| static char othercase(int); |
| static void bothcases(struct parse *, int); |
| static void ordinary(struct parse *, int); |
| static void nonnewline(struct parse *); |
| static void repeat(struct parse *, sopno, int, int); |
| static int seterr(struct parse *, int); |
| static cset *allocset(struct parse *); |
| static void freeset(struct parse *, cset *); |
| static int freezeset(struct parse *, cset *); |
| static int firstch(struct parse *, cset *); |
| static int nch(struct parse *, cset *); |
| static void mcadd(struct parse *, cset *, const char *); |
| static void mcinvert(struct parse *, cset *); |
| static void mccase(struct parse *, cset *); |
| static int isinsets(struct re_guts *, int); |
| static int samesets(struct re_guts *, int, int); |
| static void categorize(struct parse *, struct re_guts *); |
| static sopno dupl(struct parse *, sopno, sopno); |
| static void doemit(struct parse *, sop, size_t); |
| static void doinsert(struct parse *, sop, size_t, sopno); |
| static void dofwd(struct parse *, sopno, sop); |
| static void enlarge(struct parse *, sopno); |
| static void stripsnug(struct parse *, struct re_guts *); |
| static void findmust(struct parse *, struct re_guts *); |
| static sopno pluscount(struct parse *, struct re_guts *); |
| |
| static char nuls[10]; /* place to point scanner in event of error */ |
| |
| /* |
| * macros for use with parse structure |
| * BEWARE: these know that the parse structure is named `p' !!! |
| */ |
| #define PEEK() (*p->next) |
| #define PEEK2() (*(p->next+1)) |
| #define MORE() (p->end - p->next > 0) |
| #define MORE2() (p->end - p->next > 1) |
| #define SEE(c) (MORE() && PEEK() == (c)) |
| #define SEETWO(a, b) (MORE2() && PEEK() == (a) && PEEK2() == (b)) |
| #define EAT(c) ((SEE(c)) ? (NEXT(), 1) : 0) |
| #define EATTWO(a, b) ((SEETWO(a, b)) ? (NEXT2(), 1) : 0) |
| #define NEXT() (p->next++) |
| #define NEXT2() (p->next += 2) |
| #define NEXTn(n) (p->next += (n)) |
| #define GETNEXT() (*p->next++) |
| #define SETERROR(e) seterr(p, (e)) |
| #define REQUIRE(co, e) (void)((co) || SETERROR(e)) |
| #define MUSTSEE(c, e) (REQUIRE(MORE() && PEEK() == (c), e)) |
| #define MUSTEAT(c, e) (REQUIRE(MORE() && GETNEXT() == (c), e)) |
| #define MUSTNOTSEE(c, e) (REQUIRE(!MORE() || PEEK() != (c), e)) |
| #define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd)) |
| #define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos) |
| #define AHEAD(pos) dofwd(p, pos, HERE()-(pos)) |
| #define ASTERN(sop, pos) EMIT(sop, HERE()-pos) |
| #define HERE() (p->slen) |
| #define THERE() (p->slen - 1) |
| #define THERETHERE() (p->slen - 2) |
| #define DROP(n) (p->slen -= (n)) |
| |
| #ifdef _POSIX2_RE_DUP_MAX |
| #define DUPMAX _POSIX2_RE_DUP_MAX |
| #else |
| #define DUPMAX 255 |
| #endif |
| #define INFINITY (DUPMAX + 1) |
| |
| #ifndef NDEBUG |
| static int never = 0; /* for use in asserts; shuts lint up */ |
| #else |
| #define never 0 /* some <assert.h>s have bugs too */ |
| #endif |
| |
| /* |
| - llvm_regcomp - interface for parser and compilation |
| */ |
| int /* 0 success, otherwise REG_something */ |
| llvm_regcomp(llvm_regex_t *preg, const char *pattern, int cflags) |
| { |
| struct parse pa; |
| struct re_guts *g; |
| struct parse *p = &pa; |
| int i; |
| size_t len; |
| #ifdef REDEBUG |
| # define GOODFLAGS(f) (f) |
| #else |
| # define GOODFLAGS(f) ((f)&~REG_DUMP) |
| #endif |
| |
| cflags = GOODFLAGS(cflags); |
| if ((cflags®_EXTENDED) && (cflags®_NOSPEC)) |
| return(REG_INVARG); |
| |
| if (cflags®_PEND) { |
| if (preg->re_endp < pattern) |
| return(REG_INVARG); |
| len = preg->re_endp - pattern; |
| } else |
| len = strlen((const char *)pattern); |
| |
| /* do the mallocs early so failure handling is easy */ |
| g = (struct re_guts *)malloc(sizeof(struct re_guts) + |
| (NC-1)*sizeof(cat_t)); |
| if (g == NULL) |
| return(REG_ESPACE); |
| p->ssize = len/(size_t)2*(size_t)3 + (size_t)1; /* ugh */ |
| p->strip = (sop *)calloc(p->ssize, sizeof(sop)); |
| p->slen = 0; |
| if (p->strip == NULL) { |
| free((char *)g); |
| return(REG_ESPACE); |
| } |
| |
| /* set things up */ |
| p->g = g; |
| p->next = (char *)pattern; /* convenience; we do not modify it */ |
| p->end = p->next + len; |
| p->error = 0; |
| p->ncsalloc = 0; |
| for (i = 0; i < NPAREN; i++) { |
| p->pbegin[i] = 0; |
| p->pend[i] = 0; |
| } |
| g->csetsize = NC; |
| g->sets = NULL; |
| g->setbits = NULL; |
| g->ncsets = 0; |
| g->cflags = cflags; |
| g->iflags = 0; |
| g->nbol = 0; |
| g->neol = 0; |
| g->must = NULL; |
| g->mlen = 0; |
| g->nsub = 0; |
| g->ncategories = 1; /* category 0 is "everything else" */ |
| g->categories = &g->catspace[-(CHAR_MIN)]; |
| (void) memset((char *)g->catspace, 0, NC*sizeof(cat_t)); |
| g->backrefs = 0; |
| |
| /* do it */ |
| EMIT(OEND, 0); |
| g->firststate = THERE(); |
| if (cflags®_EXTENDED) |
| p_ere(p, OUT); |
| else if (cflags®_NOSPEC) |
| p_str(p); |
| else |
| p_bre(p, OUT, OUT); |
| EMIT(OEND, 0); |
| g->laststate = THERE(); |
| |
| /* tidy up loose ends and fill things in */ |
| categorize(p, g); |
| stripsnug(p, g); |
| findmust(p, g); |
| g->nplus = pluscount(p, g); |
| g->magic = MAGIC2; |
| preg->re_nsub = g->nsub; |
| preg->re_g = g; |
| preg->re_magic = MAGIC1; |
| #ifndef REDEBUG |
| /* not debugging, so can't rely on the assert() in llvm_regexec() */ |
| if (g->iflags®EX_BAD) |
| SETERROR(REG_ASSERT); |
| #endif |
| |
| /* win or lose, we're done */ |
| if (p->error != 0) /* lose */ |
| llvm_regfree(preg); |
| return(p->error); |
| } |
| |
| /* |
| - p_ere - ERE parser top level, concatenation and alternation |
| */ |
| static void |
| p_ere(struct parse *p, int stop) /* character this ERE should end at */ |
| { |
| char c; |
| sopno prevback = 0; |
| sopno prevfwd = 0; |
| sopno conc; |
| int first = 1; /* is this the first alternative? */ |
| |
| for (;;) { |
| /* do a bunch of concatenated expressions */ |
| conc = HERE(); |
| while (MORE() && (c = PEEK()) != '|' && c != stop) |
| p_ere_exp(p); |
| REQUIRE(HERE() != conc, REG_EMPTY); /* require nonempty */ |
| |
| if (!EAT('|')) |
| break; /* NOTE BREAK OUT */ |
| |
| if (first) { |
| INSERT(OCH_, conc); /* offset is wrong */ |
| prevfwd = conc; |
| prevback = conc; |
| first = 0; |
| } |
| ASTERN(OOR1, prevback); |
| prevback = THERE(); |
| AHEAD(prevfwd); /* fix previous offset */ |
| prevfwd = HERE(); |
| EMIT(OOR2, 0); /* offset is very wrong */ |
| } |
| |
| if (!first) { /* tail-end fixups */ |
| AHEAD(prevfwd); |
| ASTERN(O_CH, prevback); |
| } |
| |
| assert(!MORE() || SEE(stop)); |
| } |
| |
| /* |
| - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op |
| */ |
| static void |
| p_ere_exp(struct parse *p) |
| { |
| char c; |
| sopno pos; |
| int count; |
| int count2; |
| int backrefnum; |
| sopno subno; |
| int wascaret = 0; |
| |
| assert(MORE()); /* caller should have ensured this */ |
| c = GETNEXT(); |
| |
| pos = HERE(); |
| switch (c) { |
| case '(': |
| REQUIRE(MORE(), REG_EPAREN); |
| p->g->nsub++; |
| subno = p->g->nsub; |
| if (subno < NPAREN) |
| p->pbegin[subno] = HERE(); |
| EMIT(OLPAREN, subno); |
| if (!SEE(')')) |
| p_ere(p, ')'); |
| if (subno < NPAREN) { |
| p->pend[subno] = HERE(); |
| assert(p->pend[subno] != 0); |
| } |
| EMIT(ORPAREN, subno); |
| MUSTEAT(')', REG_EPAREN); |
| break; |
| #ifndef POSIX_MISTAKE |
| case ')': /* happens only if no current unmatched ( */ |
| /* |
| * You may ask, why the ifndef? Because I didn't notice |
| * this until slightly too late for 1003.2, and none of the |
| * other 1003.2 regular-expression reviewers noticed it at |
| * all. So an unmatched ) is legal POSIX, at least until |
| * we can get it fixed. |
| */ |
| SETERROR(REG_EPAREN); |
| break; |
| #endif |
| case '^': |
| EMIT(OBOL, 0); |
| p->g->iflags |= USEBOL; |
| p->g->nbol++; |
| wascaret = 1; |
| break; |
| case '$': |
| EMIT(OEOL, 0); |
| p->g->iflags |= USEEOL; |
| p->g->neol++; |
| break; |
| case '|': |
| SETERROR(REG_EMPTY); |
| break; |
| case '*': |
| case '+': |
| case '?': |
| SETERROR(REG_BADRPT); |
| break; |
| case '.': |
| if (p->g->cflags®_NEWLINE) |
| nonnewline(p); |
| else |
| EMIT(OANY, 0); |
| break; |
| case '[': |
| p_bracket(p); |
| break; |
| case '\\': |
| REQUIRE(MORE(), REG_EESCAPE); |
| c = GETNEXT(); |
| if (c >= '1' && c <= '9') { |
| /* \[0-9] is taken to be a back-reference to a previously specified |
| * matching group. backrefnum will hold the number. The matching |
| * group must exist (i.e. if \4 is found there must have been at |
| * least 4 matching groups specified in the pattern previously). |
| */ |
| backrefnum = c - '0'; |
| if (p->pend[backrefnum] == 0) { |
| SETERROR(REG_ESUBREG); |
| break; |
| } |
| |
| /* Make sure everything checks out and emit the sequence |
| * that marks a back-reference to the parse structure. |
| */ |
| assert(backrefnum <= p->g->nsub); |
| EMIT(OBACK_, backrefnum); |
| assert(p->pbegin[backrefnum] != 0); |
| assert(OP(p->strip[p->pbegin[backrefnum]]) == OLPAREN); |
| assert(OP(p->strip[p->pend[backrefnum]]) == ORPAREN); |
| (void) dupl(p, p->pbegin[backrefnum]+1, p->pend[backrefnum]); |
| EMIT(O_BACK, backrefnum); |
| p->g->backrefs = 1; |
| } else { |
| /* Other chars are simply themselves when escaped with a backslash. |
| */ |
| ordinary(p, c); |
| } |
| break; |
| case '{': /* okay as ordinary except if digit follows */ |
| REQUIRE(!MORE() || !isdigit((uch)PEEK()), REG_BADRPT); |
| LLVM_FALLTHROUGH; |
| default: |
| ordinary(p, c); |
| break; |
| } |
| |
| if (!MORE()) |
| return; |
| c = PEEK(); |
| /* we call { a repetition if followed by a digit */ |
| if (!( c == '*' || c == '+' || c == '?' || |
| (c == '{' && MORE2() && isdigit((uch)PEEK2())) )) |
| return; /* no repetition, we're done */ |
| NEXT(); |
| |
| REQUIRE(!wascaret, REG_BADRPT); |
| switch (c) { |
| case '*': /* implemented as +? */ |
| /* this case does not require the (y|) trick, noKLUDGE */ |
| INSERT(OPLUS_, pos); |
| ASTERN(O_PLUS, pos); |
| INSERT(OQUEST_, pos); |
| ASTERN(O_QUEST, pos); |
| break; |
| case '+': |
| INSERT(OPLUS_, pos); |
| ASTERN(O_PLUS, pos); |
| break; |
| case '?': |
| /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ |
| INSERT(OCH_, pos); /* offset slightly wrong */ |
| ASTERN(OOR1, pos); /* this one's right */ |
| AHEAD(pos); /* fix the OCH_ */ |
| EMIT(OOR2, 0); /* offset very wrong... */ |
| AHEAD(THERE()); /* ...so fix it */ |
| ASTERN(O_CH, THERETHERE()); |
| break; |
| case '{': |
| count = p_count(p); |
| if (EAT(',')) { |
| if (isdigit((uch)PEEK())) { |
| count2 = p_count(p); |
| REQUIRE(count <= count2, REG_BADBR); |
| } else /* single number with comma */ |
| count2 = INFINITY; |
| } else /* just a single number */ |
| count2 = count; |
| repeat(p, pos, count, count2); |
| if (!EAT('}')) { /* error heuristics */ |
| while (MORE() && PEEK() != '}') |
| NEXT(); |
| REQUIRE(MORE(), REG_EBRACE); |
| SETERROR(REG_BADBR); |
| } |
| break; |
| } |
| |
| if (!MORE()) |
| return; |
| c = PEEK(); |
| if (!( c == '*' || c == '+' || c == '?' || |
| (c == '{' && MORE2() && isdigit((uch)PEEK2())) ) ) |
| return; |
| SETERROR(REG_BADRPT); |
| } |
| |
| /* |
| - p_str - string (no metacharacters) "parser" |
| */ |
| static void |
| p_str(struct parse *p) |
| { |
| REQUIRE(MORE(), REG_EMPTY); |
| while (MORE()) |
| ordinary(p, GETNEXT()); |
| } |
| |
| /* |
| - p_bre - BRE parser top level, anchoring and concatenation |
| * Giving end1 as OUT essentially eliminates the end1/end2 check. |
| * |
| * This implementation is a bit of a kludge, in that a trailing $ is first |
| * taken as an ordinary character and then revised to be an anchor. The |
| * only undesirable side effect is that '$' gets included as a character |
| * category in such cases. This is fairly harmless; not worth fixing. |
| * The amount of lookahead needed to avoid this kludge is excessive. |
| */ |
| static void |
| p_bre(struct parse *p, |
| int end1, /* first terminating character */ |
| int end2) /* second terminating character */ |
| { |
| sopno start = HERE(); |
| int first = 1; /* first subexpression? */ |
| int wasdollar = 0; |
| |
| if (EAT('^')) { |
| EMIT(OBOL, 0); |
| p->g->iflags |= USEBOL; |
| p->g->nbol++; |
| } |
| while (MORE() && !SEETWO(end1, end2)) { |
| wasdollar = p_simp_re(p, first); |
| first = 0; |
| } |
| if (wasdollar) { /* oops, that was a trailing anchor */ |
| DROP(1); |
| EMIT(OEOL, 0); |
| p->g->iflags |= USEEOL; |
| p->g->neol++; |
| } |
| |
| REQUIRE(HERE() != start, REG_EMPTY); /* require nonempty */ |
| } |
| |
| /* |
| - p_simp_re - parse a simple RE, an atom possibly followed by a repetition |
| */ |
| static int /* was the simple RE an unbackslashed $? */ |
| p_simp_re(struct parse *p, |
| int starordinary) /* is a leading * an ordinary character? */ |
| { |
| int c; |
| int count; |
| int count2; |
| sopno pos; |
| int i; |
| sopno subno; |
| # define BACKSL (1<<CHAR_BIT) |
| |
| pos = HERE(); /* repetition op, if any, covers from here */ |
| |
| assert(MORE()); /* caller should have ensured this */ |
| c = GETNEXT(); |
| if (c == '\\') { |
| REQUIRE(MORE(), REG_EESCAPE); |
| c = BACKSL | GETNEXT(); |
| } |
| switch (c) { |
| case '.': |
| if (p->g->cflags®_NEWLINE) |
| nonnewline(p); |
| else |
| EMIT(OANY, 0); |
| break; |
| case '[': |
| p_bracket(p); |
| break; |
| case BACKSL|'{': |
| SETERROR(REG_BADRPT); |
| break; |
| case BACKSL|'(': |
| p->g->nsub++; |
| subno = p->g->nsub; |
| if (subno < NPAREN) |
| p->pbegin[subno] = HERE(); |
| EMIT(OLPAREN, subno); |
| /* the MORE here is an error heuristic */ |
| if (MORE() && !SEETWO('\\', ')')) |
| p_bre(p, '\\', ')'); |
| if (subno < NPAREN) { |
| p->pend[subno] = HERE(); |
| assert(p->pend[subno] != 0); |
| } |
| EMIT(ORPAREN, subno); |
| REQUIRE(EATTWO('\\', ')'), REG_EPAREN); |
| break; |
| case BACKSL|')': /* should not get here -- must be user */ |
| case BACKSL|'}': |
| SETERROR(REG_EPAREN); |
| break; |
| case BACKSL|'1': |
| case BACKSL|'2': |
| case BACKSL|'3': |
| case BACKSL|'4': |
| case BACKSL|'5': |
| case BACKSL|'6': |
| case BACKSL|'7': |
| case BACKSL|'8': |
| case BACKSL|'9': |
| i = (c&~BACKSL) - '0'; |
| assert(i < NPAREN); |
| if (p->pend[i] != 0) { |
| assert(i <= p->g->nsub); |
| EMIT(OBACK_, i); |
| assert(p->pbegin[i] != 0); |
| assert(OP(p->strip[p->pbegin[i]]) == OLPAREN); |
| assert(OP(p->strip[p->pend[i]]) == ORPAREN); |
| (void) dupl(p, p->pbegin[i]+1, p->pend[i]); |
| EMIT(O_BACK, i); |
| } else |
| SETERROR(REG_ESUBREG); |
| p->g->backrefs = 1; |
| break; |
| case '*': |
| REQUIRE(starordinary, REG_BADRPT); |
| LLVM_FALLTHROUGH; |
| default: |
| ordinary(p, (char)c); |
| break; |
| } |
| |
| if (EAT('*')) { /* implemented as +? */ |
| /* this case does not require the (y|) trick, noKLUDGE */ |
| INSERT(OPLUS_, pos); |
| ASTERN(O_PLUS, pos); |
| INSERT(OQUEST_, pos); |
| ASTERN(O_QUEST, pos); |
| } else if (EATTWO('\\', '{')) { |
| count = p_count(p); |
| if (EAT(',')) { |
| if (MORE() && isdigit((uch)PEEK())) { |
| count2 = p_count(p); |
| REQUIRE(count <= count2, REG_BADBR); |
| } else /* single number with comma */ |
| count2 = INFINITY; |
| } else /* just a single number */ |
| count2 = count; |
| repeat(p, pos, count, count2); |
| if (!EATTWO('\\', '}')) { /* error heuristics */ |
| while (MORE() && !SEETWO('\\', '}')) |
| NEXT(); |
| REQUIRE(MORE(), REG_EBRACE); |
| SETERROR(REG_BADBR); |
| } |
| } else if (c == '$') /* $ (but not \$) ends it */ |
| return(1); |
| |
| return(0); |
| } |
| |
| /* |
| - p_count - parse a repetition count |
| */ |
| static int /* the value */ |
| p_count(struct parse *p) |
| { |
| int count = 0; |
| int ndigits = 0; |
| |
| while (MORE() && isdigit((uch)PEEK()) && count <= DUPMAX) { |
| count = count*10 + (GETNEXT() - '0'); |
| ndigits++; |
| } |
| |
| REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR); |
| return(count); |
| } |
| |
| /* |
| - p_bracket - parse a bracketed character list |
| * |
| * Note a significant property of this code: if the allocset() did SETERROR, |
| * no set operations are done. |
| */ |
| static void |
| p_bracket(struct parse *p) |
| { |
| cset *cs; |
| int invert = 0; |
| |
| /* Dept of Truly Sickening Special-Case Kludges */ |
| if (p->end - p->next > 5) { |
| if (strncmp(p->next, "[:<:]]", 6) == 0) { |
| EMIT(OBOW, 0); |
| NEXTn(6); |
| return; |
| } |
| if (strncmp(p->next, "[:>:]]", 6) == 0) { |
| EMIT(OEOW, 0); |
| NEXTn(6); |
| return; |
| } |
| } |
| |
| if ((cs = allocset(p)) == NULL) { |
| /* allocset did set error status in p */ |
| return; |
| } |
| |
| if (EAT('^')) |
| invert++; /* make note to invert set at end */ |
| if (EAT(']')) |
| CHadd(cs, ']'); |
| else if (EAT('-')) |
| CHadd(cs, '-'); |
| while (MORE() && PEEK() != ']' && !SEETWO('-', ']')) |
| p_b_term(p, cs); |
| if (EAT('-')) |
| CHadd(cs, '-'); |
| MUSTEAT(']', REG_EBRACK); |
| |
| if (p->error != 0) { /* don't mess things up further */ |
| freeset(p, cs); |
| return; |
| } |
| |
| if (p->g->cflags®_ICASE) { |
| int i; |
| int ci; |
| |
| for (i = p->g->csetsize - 1; i >= 0; i--) |
| if (CHIN(cs, i) && isalpha(i)) { |
| ci = othercase(i); |
| if (ci != i) |
| CHadd(cs, ci); |
| } |
| if (cs->multis != NULL) |
| mccase(p, cs); |
| } |
| if (invert) { |
| int i; |
| |
| for (i = p->g->csetsize - 1; i >= 0; i--) |
| if (CHIN(cs, i)) |
| CHsub(cs, i); |
| else |
| CHadd(cs, i); |
| if (p->g->cflags®_NEWLINE) |
| CHsub(cs, '\n'); |
| if (cs->multis != NULL) |
| mcinvert(p, cs); |
| } |
| |
| assert(cs->multis == NULL); /* xxx */ |
| |
| if (nch(p, cs) == 1) { /* optimize singleton sets */ |
| ordinary(p, firstch(p, cs)); |
| freeset(p, cs); |
| } else |
| EMIT(OANYOF, freezeset(p, cs)); |
| } |
| |
| /* |
| - p_b_term - parse one term of a bracketed character list |
| */ |
| static void |
| p_b_term(struct parse *p, cset *cs) |
| { |
| char c; |
| char start, finish; |
| int i; |
| |
| /* classify what we've got */ |
| switch ((MORE()) ? PEEK() : '\0') { |
| case '[': |
| c = (MORE2()) ? PEEK2() : '\0'; |
| break; |
| case '-': |
| SETERROR(REG_ERANGE); |
| return; /* NOTE RETURN */ |
| break; |
| default: |
| c = '\0'; |
| break; |
| } |
| |
| switch (c) { |
| case ':': /* character class */ |
| NEXT2(); |
| REQUIRE(MORE(), REG_EBRACK); |
| c = PEEK(); |
| REQUIRE(c != '-' && c != ']', REG_ECTYPE); |
| p_b_cclass(p, cs); |
| REQUIRE(MORE(), REG_EBRACK); |
| REQUIRE(EATTWO(':', ']'), REG_ECTYPE); |
| break; |
| case '=': /* equivalence class */ |
| NEXT2(); |
| REQUIRE(MORE(), REG_EBRACK); |
| c = PEEK(); |
| REQUIRE(c != '-' && c != ']', REG_ECOLLATE); |
| p_b_eclass(p, cs); |
| REQUIRE(MORE(), REG_EBRACK); |
| REQUIRE(EATTWO('=', ']'), REG_ECOLLATE); |
| break; |
| default: /* symbol, ordinary character, or range */ |
| /* xxx revision needed for multichar stuff */ |
| start = p_b_symbol(p); |
| if (SEE('-') && MORE2() && PEEK2() != ']') { |
| /* range */ |
| NEXT(); |
| if (EAT('-')) |
| finish = '-'; |
| else |
| finish = p_b_symbol(p); |
| } else |
| finish = start; |
| /* xxx what about signed chars here... */ |
| REQUIRE(start <= finish, REG_ERANGE); |
| for (i = start; i <= finish; i++) |
| CHadd(cs, i); |
| break; |
| } |
| } |
| |
| /* |
| - p_b_cclass - parse a character-class name and deal with it |
| */ |
| static void |
| p_b_cclass(struct parse *p, cset *cs) |
| { |
| char *sp = p->next; |
| struct cclass *cp; |
| size_t len; |
| const char *u; |
| char c; |
| |
| while (MORE() && isalpha((uch)PEEK())) |
| NEXT(); |
| len = p->next - sp; |
| for (cp = cclasses; cp->name != NULL; cp++) |
| if (strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0') |
| break; |
| if (cp->name == NULL) { |
| /* oops, didn't find it */ |
| SETERROR(REG_ECTYPE); |
| return; |
| } |
| |
| u = cp->chars; |
| while ((c = *u++) != '\0') |
| CHadd(cs, c); |
| for (u = cp->multis; *u != '\0'; u += strlen(u) + 1) |
| MCadd(p, cs, u); |
| } |
| |
| /* |
| - p_b_eclass - parse an equivalence-class name and deal with it |
| * |
| * This implementation is incomplete. xxx |
| */ |
| static void |
| p_b_eclass(struct parse *p, cset *cs) |
| { |
| char c; |
| |
| c = p_b_coll_elem(p, '='); |
| CHadd(cs, c); |
| } |
| |
| /* |
| - p_b_symbol - parse a character or [..]ed multicharacter collating symbol |
| */ |
| static char /* value of symbol */ |
| p_b_symbol(struct parse *p) |
| { |
| char value; |
| |
| REQUIRE(MORE(), REG_EBRACK); |
| if (!EATTWO('[', '.')) |
| return(GETNEXT()); |
| |
| /* collating symbol */ |
| value = p_b_coll_elem(p, '.'); |
| REQUIRE(EATTWO('.', ']'), REG_ECOLLATE); |
| return(value); |
| } |
| |
| /* |
| - p_b_coll_elem - parse a collating-element name and look it up |
| */ |
| static char /* value of collating element */ |
| p_b_coll_elem(struct parse *p, |
| int endc) /* name ended by endc,']' */ |
| { |
| char *sp = p->next; |
| struct cname *cp; |
| size_t len; |
| |
| while (MORE() && !SEETWO(endc, ']')) |
| NEXT(); |
| if (!MORE()) { |
| SETERROR(REG_EBRACK); |
| return(0); |
| } |
| len = p->next - sp; |
| for (cp = cnames; cp->name != NULL; cp++) |
| if (strncmp(cp->name, sp, len) == 0 && strlen(cp->name) == len) |
| return(cp->code); /* known name */ |
| if (len == 1) |
| return(*sp); /* single character */ |
| SETERROR(REG_ECOLLATE); /* neither */ |
| return(0); |
| } |
| |
| /* |
| - othercase - return the case counterpart of an alphabetic |
| */ |
| static char /* if no counterpart, return ch */ |
| othercase(int ch) |
| { |
| ch = (uch)ch; |
| assert(isalpha(ch)); |
| if (isupper(ch)) |
| return ((uch)tolower(ch)); |
| else if (islower(ch)) |
| return ((uch)toupper(ch)); |
| else /* peculiar, but could happen */ |
| return(ch); |
| } |
| |
| /* |
| - bothcases - emit a dualcase version of a two-case character |
| * |
| * Boy, is this implementation ever a kludge... |
| */ |
| static void |
| bothcases(struct parse *p, int ch) |
| { |
| char *oldnext = p->next; |
| char *oldend = p->end; |
| char bracket[3]; |
| |
| ch = (uch)ch; |
| assert(othercase(ch) != ch); /* p_bracket() would recurse */ |
| p->next = bracket; |
| p->end = bracket+2; |
| bracket[0] = ch; |
| bracket[1] = ']'; |
| bracket[2] = '\0'; |
| p_bracket(p); |
| assert(p->next == bracket+2); |
| p->next = oldnext; |
| p->end = oldend; |
| } |
| |
| /* |
| - ordinary - emit an ordinary character |
| */ |
| static void |
| ordinary(struct parse *p, int ch) |
| { |
| cat_t *cap = p->g->categories; |
| |
| if ((p->g->cflags®_ICASE) && isalpha((uch)ch) && othercase(ch) != ch) |
| bothcases(p, ch); |
| else { |
| EMIT(OCHAR, (uch)ch); |
| if (cap[ch] == 0) |
| cap[ch] = p->g->ncategories++; |
| } |
| } |
| |
| /* |
| - nonnewline - emit REG_NEWLINE version of OANY |
| * |
| * Boy, is this implementation ever a kludge... |
| */ |
| static void |
| nonnewline(struct parse *p) |
| { |
| char *oldnext = p->next; |
| char *oldend = p->end; |
| char bracket[4]; |
| |
| p->next = bracket; |
| p->end = bracket+3; |
| bracket[0] = '^'; |
| bracket[1] = '\n'; |
| bracket[2] = ']'; |
| bracket[3] = '\0'; |
| p_bracket(p); |
| assert(p->next == bracket+3); |
| p->next = oldnext; |
| p->end = oldend; |
| } |
| |
| /* |
| - repeat - generate code for a bounded repetition, recursively if needed |
| */ |
| static void |
| repeat(struct parse *p, |
| sopno start, /* operand from here to end of strip */ |
| int from, /* repeated from this number */ |
| int to) /* to this number of times (maybe INFINITY) */ |
| { |
| sopno finish = HERE(); |
| # define N 2 |
| # define INF 3 |
| # define REP(f, t) ((f)*8 + (t)) |
| # define MAP(n) (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N) |
| sopno copy; |
| |
| if (p->error != 0) /* head off possible runaway recursion */ |
| return; |
| |
| assert(from <= to); |
| |
| switch (REP(MAP(from), MAP(to))) { |
| case REP(0, 0): /* must be user doing this */ |
| DROP(finish-start); /* drop the operand */ |
| break; |
| case REP(0, 1): /* as x{1,1}? */ |
| case REP(0, N): /* as x{1,n}? */ |
| case REP(0, INF): /* as x{1,}? */ |
| /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ |
| INSERT(OCH_, start); /* offset is wrong... */ |
| repeat(p, start+1, 1, to); |
| ASTERN(OOR1, start); |
| AHEAD(start); /* ... fix it */ |
| EMIT(OOR2, 0); |
| AHEAD(THERE()); |
| ASTERN(O_CH, THERETHERE()); |
| break; |
| case REP(1, 1): /* trivial case */ |
| /* done */ |
| break; |
| case REP(1, N): /* as x?x{1,n-1} */ |
| /* KLUDGE: emit y? as (y|) until subtle bug gets fixed */ |
| INSERT(OCH_, start); |
| ASTERN(OOR1, start); |
| AHEAD(start); |
| EMIT(OOR2, 0); /* offset very wrong... */ |
| AHEAD(THERE()); /* ...so fix it */ |
| ASTERN(O_CH, THERETHERE()); |
| copy = dupl(p, start+1, finish+1); |
| assert(copy == finish+4); |
| repeat(p, copy, 1, to-1); |
| break; |
| case REP(1, INF): /* as x+ */ |
| INSERT(OPLUS_, start); |
| ASTERN(O_PLUS, start); |
| break; |
| case REP(N, N): /* as xx{m-1,n-1} */ |
| copy = dupl(p, start, finish); |
| repeat(p, copy, from-1, to-1); |
| break; |
| case REP(N, INF): /* as xx{n-1,INF} */ |
| copy = dupl(p, start, finish); |
| repeat(p, copy, from-1, to); |
| break; |
| default: /* "can't happen" */ |
| SETERROR(REG_ASSERT); /* just in case */ |
| break; |
| } |
| } |
| |
| /* |
| - seterr - set an error condition |
| */ |
| static int /* useless but makes type checking happy */ |
| seterr(struct parse *p, int e) |
| { |
| if (p->error == 0) /* keep earliest error condition */ |
| p->error = e; |
| p->next = nuls; /* try to bring things to a halt */ |
| p->end = nuls; |
| return(0); /* make the return value well-defined */ |
| } |
| |
| /* |
| - allocset - allocate a set of characters for [] |
| */ |
| static cset * |
| allocset(struct parse *p) |
| { |
| int no = p->g->ncsets++; |
| size_t nc; |
| size_t nbytes; |
| cset *cs; |
| size_t css = (size_t)p->g->csetsize; |
| int i; |
| |
| if (no >= p->ncsalloc) { /* need another column of space */ |
| void *ptr; |
| |
| p->ncsalloc += CHAR_BIT; |
| nc = p->ncsalloc; |
| if (nc > SIZE_MAX / sizeof(cset)) |
| goto nomem; |
| assert(nc % CHAR_BIT == 0); |
| nbytes = nc / CHAR_BIT * css; |
| |
| ptr = (cset *)realloc((char *)p->g->sets, nc * sizeof(cset)); |
| if (ptr == NULL) |
| goto nomem; |
| p->g->sets = ptr; |
| |
| ptr = (uch *)realloc((char *)p->g->setbits, nbytes); |
| if (ptr == NULL) |
| goto nomem; |
| p->g->setbits = ptr; |
| |
| for (i = 0; i < no; i++) |
| p->g->sets[i].ptr = p->g->setbits + css*(i/CHAR_BIT); |
| |
| (void) memset((char *)p->g->setbits + (nbytes - css), 0, css); |
| } |
| /* XXX should not happen */ |
| if (p->g->sets == NULL || p->g->setbits == NULL) |
| goto nomem; |
| |
| cs = &p->g->sets[no]; |
| cs->ptr = p->g->setbits + css*((no)/CHAR_BIT); |
| cs->mask = 1 << ((no) % CHAR_BIT); |
| cs->hash = 0; |
| cs->smultis = 0; |
| cs->multis = NULL; |
| |
| return(cs); |
| nomem: |
| free(p->g->sets); |
| p->g->sets = NULL; |
| free(p->g->setbits); |
| p->g->setbits = NULL; |
| |
| SETERROR(REG_ESPACE); |
| /* caller's responsibility not to do set ops */ |
| return(NULL); |
| } |
| |
| /* |
| - freeset - free a now-unused set |
| */ |
| static void |
| freeset(struct parse *p, cset *cs) |
| { |
| size_t i; |
| cset *top = &p->g->sets[p->g->ncsets]; |
| size_t css = (size_t)p->g->csetsize; |
| |
| for (i = 0; i < css; i++) |
| CHsub(cs, i); |
| if (cs == top-1) /* recover only the easy case */ |
| p->g->ncsets--; |
| } |
| |
| /* |
| - freezeset - final processing on a set of characters |
| * |
| * The main task here is merging identical sets. This is usually a waste |
| * of time (although the hash code minimizes the overhead), but can win |
| * big if REG_ICASE is being used. REG_ICASE, by the way, is why the hash |
| * is done using addition rather than xor -- all ASCII [aA] sets xor to |
| * the same value! |
| */ |
| static int /* set number */ |
| freezeset(struct parse *p, cset *cs) |
| { |
| uch h = cs->hash; |
| size_t i; |
| cset *top = &p->g->sets[p->g->ncsets]; |
| cset *cs2; |
| size_t css = (size_t)p->g->csetsize; |
| |
| /* look for an earlier one which is the same */ |
| for (cs2 = &p->g->sets[0]; cs2 < top; cs2++) |
| if (cs2->hash == h && cs2 != cs) { |
| /* maybe */ |
| for (i = 0; i < css; i++) |
| if (!!CHIN(cs2, i) != !!CHIN(cs, i)) |
| break; /* no */ |
| if (i == css) |
| break; /* yes */ |
| } |
| |
| if (cs2 < top) { /* found one */ |
| freeset(p, cs); |
| cs = cs2; |
| } |
| |
| return((int)(cs - p->g->sets)); |
| } |
| |
| /* |
| - firstch - return first character in a set (which must have at least one) |
| */ |
| static int /* character; there is no "none" value */ |
| firstch(struct parse *p, cset *cs) |
| { |
| size_t i; |
| size_t css = (size_t)p->g->csetsize; |
| |
| for (i = 0; i < css; i++) |
| if (CHIN(cs, i)) |
| return((char)i); |
| assert(never); |
| return(0); /* arbitrary */ |
| } |
| |
| /* |
| - nch - number of characters in a set |
| */ |
| static int |
| nch(struct parse *p, cset *cs) |
| { |
| size_t i; |
| size_t css = (size_t)p->g->csetsize; |
| int n = 0; |
| |
| for (i = 0; i < css; i++) |
| if (CHIN(cs, i)) |
| n++; |
| return(n); |
| } |
| |
| /* |
| - mcadd - add a collating element to a cset |
| */ |
| static void |
| mcadd( struct parse *p, cset *cs, const char *cp) |
| { |
| size_t oldend = cs->smultis; |
| void *np; |
| |
| cs->smultis += strlen(cp) + 1; |
| np = realloc(cs->multis, cs->smultis); |
| if (np == NULL) { |
| if (cs->multis) |
| free(cs->multis); |
| cs->multis = NULL; |
| SETERROR(REG_ESPACE); |
| return; |
| } |
| cs->multis = np; |
| |
| llvm_strlcpy(cs->multis + oldend - 1, cp, cs->smultis - oldend + 1); |
| } |
| |
| /* |
| - mcinvert - invert the list of collating elements in a cset |
| * |
| * This would have to know the set of possibilities. Implementation |
| * is deferred. |
| */ |
| /* ARGSUSED */ |
| static void |
| mcinvert(struct parse *p, cset *cs) |
| { |
| assert(cs->multis == NULL); /* xxx */ |
| } |
| |
| /* |
| - mccase - add case counterparts of the list of collating elements in a cset |
| * |
| * This would have to know the set of possibilities. Implementation |
| * is deferred. |
| */ |
| /* ARGSUSED */ |
| static void |
| mccase(struct parse *p, cset *cs) |
| { |
| assert(cs->multis == NULL); /* xxx */ |
| } |
| |
| /* |
| - isinsets - is this character in any sets? |
| */ |
| static int /* predicate */ |
| isinsets(struct re_guts *g, int c) |
| { |
| uch *col; |
| int i; |
| int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; |
| unsigned uc = (uch)c; |
| |
| for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) |
| if (col[uc] != 0) |
| return(1); |
| return(0); |
| } |
| |
| /* |
| - samesets - are these two characters in exactly the same sets? |
| */ |
| static int /* predicate */ |
| samesets(struct re_guts *g, int c1, int c2) |
| { |
| uch *col; |
| int i; |
| int ncols = (g->ncsets+(CHAR_BIT-1)) / CHAR_BIT; |
| unsigned uc1 = (uch)c1; |
| unsigned uc2 = (uch)c2; |
| |
| for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize) |
| if (col[uc1] != col[uc2]) |
| return(0); |
| return(1); |
| } |
| |
| /* |
| - categorize - sort out character categories |
| */ |
| static void |
| categorize(struct parse *p, struct re_guts *g) |
| { |
| cat_t *cats = g->categories; |
| int c; |
| int c2; |
| cat_t cat; |
| |
| /* avoid making error situations worse */ |
| if (p->error != 0) |
| return; |
| |
| for (c = CHAR_MIN; c <= CHAR_MAX; c++) |
| if (cats[c] == 0 && isinsets(g, c)) { |
| cat = g->ncategories++; |
| cats[c] = cat; |
| for (c2 = c+1; c2 <= CHAR_MAX; c2++) |
| if (cats[c2] == 0 && samesets(g, c, c2)) |
| cats[c2] = cat; |
| } |
| } |
| |
| /* |
| - dupl - emit a duplicate of a bunch of sops |
| */ |
| static sopno /* start of duplicate */ |
| dupl(struct parse *p, |
| sopno start, /* from here */ |
| sopno finish) /* to this less one */ |
| { |
| sopno ret = HERE(); |
| sopno len = finish - start; |
| |
| assert(finish >= start); |
| if (len == 0) |
| return(ret); |
| enlarge(p, p->ssize + len); /* this many unexpected additions */ |
| assert(p->ssize >= p->slen + len); |
| (void) memmove((char *)(p->strip + p->slen), |
| (char *)(p->strip + start), (size_t)len*sizeof(sop)); |
| p->slen += len; |
| return(ret); |
| } |
| |
| /* |
| - doemit - emit a strip operator |
| * |
| * It might seem better to implement this as a macro with a function as |
| * hard-case backup, but it's just too big and messy unless there are |
| * some changes to the data structures. Maybe later. |
| */ |
| static void |
| doemit(struct parse *p, sop op, size_t opnd) |
| { |
| /* avoid making error situations worse */ |
| if (p->error != 0) |
| return; |
| |
| /* deal with oversize operands ("can't happen", more or less) */ |
| assert(opnd < 1<<OPSHIFT); |
| |
| /* deal with undersized strip */ |
| if (p->slen >= p->ssize) |
| enlarge(p, (p->ssize+1) / 2 * 3); /* +50% */ |
| assert(p->slen < p->ssize); |
| |
| /* finally, it's all reduced to the easy case */ |
| p->strip[p->slen++] = SOP(op, opnd); |
| } |
| |
| /* |
| - doinsert - insert a sop into the strip |
| */ |
| static void |
| doinsert(struct parse *p, sop op, size_t opnd, sopno pos) |
| { |
| sopno sn; |
| sop s; |
| int i; |
| |
| /* avoid making error situations worse */ |
| if (p->error != 0) |
| return; |
| |
| sn = HERE(); |
| EMIT(op, opnd); /* do checks, ensure space */ |
| assert(HERE() == sn+1); |
| s = p->strip[sn]; |
| |
| /* adjust paren pointers */ |
| assert(pos > 0); |
| for (i = 1; i < NPAREN; i++) { |
| if (p->pbegin[i] >= pos) { |
| p->pbegin[i]++; |
| } |
| if (p->pend[i] >= pos) { |
| p->pend[i]++; |
| } |
| } |
| |
| memmove((char *)&p->strip[pos+1], (char *)&p->strip[pos], |
| (HERE()-pos-1)*sizeof(sop)); |
| p->strip[pos] = s; |
| } |
| |
| /* |
| - dofwd - complete a forward reference |
| */ |
| static void |
| dofwd(struct parse *p, sopno pos, sop value) |
| { |
| /* avoid making error situations worse */ |
| if (p->error != 0) |
| return; |
| |
| assert(value < 1<<OPSHIFT); |
| p->strip[pos] = OP(p->strip[pos]) | value; |
| } |
| |
| /* |
| - enlarge - enlarge the strip |
| */ |
| static void |
| enlarge(struct parse *p, sopno size) |
| { |
| sop *sp; |
| |
| if (p->ssize >= size) |
| return; |
| |
| if ((uintptr_t)size > SIZE_MAX / sizeof(sop)) { |
| SETERROR(REG_ESPACE); |
| return; |
| } |
| |
| sp = (sop *)realloc(p->strip, size*sizeof(sop)); |
| if (sp == NULL) { |
| SETERROR(REG_ESPACE); |
| return; |
| } |
| p->strip = sp; |
| p->ssize = size; |
| } |
| |
| /* |
| - stripsnug - compact the strip |
| */ |
| static void |
| stripsnug(struct parse *p, struct re_guts *g) |
| { |
| g->nstates = p->slen; |
| if ((uintptr_t)p->slen > SIZE_MAX / sizeof(sop)) { |
| g->strip = p->strip; |
| SETERROR(REG_ESPACE); |
| return; |
| } |
| |
| g->strip = (sop *)realloc((char *)p->strip, p->slen * sizeof(sop)); |
| if (g->strip == NULL) { |
| SETERROR(REG_ESPACE); |
| g->strip = p->strip; |
| } |
| } |
| |
| /* |
| - findmust - fill in must and mlen with longest mandatory literal string |
| * |
| * This algorithm could do fancy things like analyzing the operands of | |
| * for common subsequences. Someday. This code is simple and finds most |
| * of the interesting cases. |
| * |
| * Note that must and mlen got initialized during setup. |
| */ |
| static void |
| findmust(struct parse *p, struct re_guts *g) |
| { |
| sop *scan; |
| sop *start = 0; /* start initialized in the default case, after that */ |
| sop *newstart = 0; /* newstart was initialized in the OCHAR case */ |
| sopno newlen; |
| sop s; |
| char *cp; |
| sopno i; |
| |
| /* avoid making error situations worse */ |
| if (p->error != 0) |
| return; |
| |
| /* find the longest OCHAR sequence in strip */ |
| newlen = 0; |
| scan = g->strip + 1; |
| do { |
| s = *scan++; |
| switch (OP(s)) { |
| case OCHAR: /* sequence member */ |
| if (newlen == 0) /* new sequence */ |
| newstart = scan - 1; |
| newlen++; |
| break; |
| case OPLUS_: /* things that don't break one */ |
| case OLPAREN: |
| case ORPAREN: |
| break; |
| case OQUEST_: /* things that must be skipped */ |
| case OCH_: |
| scan--; |
| do { |
| scan += OPND(s); |
| s = *scan; |
| /* assert() interferes w debug printouts */ |
| if (OP(s) != O_QUEST && OP(s) != O_CH && |
| OP(s) != OOR2) { |
| g->iflags |= REGEX_BAD; |
| return; |
| } |
| } while (OP(s) != O_QUEST && OP(s) != O_CH); |
| LLVM_FALLTHROUGH; |
| default: /* things that break a sequence */ |
| if (newlen > g->mlen) { /* ends one */ |
| start = newstart; |
| g->mlen = newlen; |
| } |
| newlen = 0; |
| break; |
| } |
| } while (OP(s) != OEND); |
| |
| if (g->mlen == 0) /* there isn't one */ |
| return; |
| |
| /* turn it into a character string */ |
| g->must = malloc((size_t)g->mlen + 1); |
| if (g->must == NULL) { /* argh; just forget it */ |
| g->mlen = 0; |
| return; |
| } |
| cp = g->must; |
| scan = start; |
| for (i = g->mlen; i > 0; i--) { |
| while (OP(s = *scan++) != OCHAR) |
| continue; |
| assert(cp < g->must + g->mlen); |
| *cp++ = (char)OPND(s); |
| } |
| assert(cp == g->must + g->mlen); |
| *cp++ = '\0'; /* just on general principles */ |
| } |
| |
| /* |
| - pluscount - count + nesting |
| */ |
| static sopno /* nesting depth */ |
| pluscount(struct parse *p, struct re_guts *g) |
| { |
| sop *scan; |
| sop s; |
| sopno plusnest = 0; |
| sopno maxnest = 0; |
| |
| if (p->error != 0) |
| return(0); /* there may not be an OEND */ |
| |
| scan = g->strip + 1; |
| do { |
| s = *scan++; |
| switch (OP(s)) { |
| case OPLUS_: |
| plusnest++; |
| break; |
| case O_PLUS: |
| if (plusnest > maxnest) |
| maxnest = plusnest; |
| plusnest--; |
| break; |
| } |
| } while (OP(s) != OEND); |
| if (plusnest != 0) |
| g->iflags |= REGEX_BAD; |
| return(maxnest); |
| } |