Logo Search packages:      
Sourcecode: tcl8.5 version File versions  Download package

regcomp.c

/*
 * re_*comp and friends - compile REs
 * This file #includes several others (see the bottom).
 *
 * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
 *
 * Development of this software was funded, in part, by Cray Research Inc.,
 * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
 * Corporation, none of whom are responsible for the results. The author
 * thanks all of them.
 *
 * Redistribution and use in source and binary forms -- with or without
 * modification -- are permitted for any purpose, provided that
 * redistributions in source form retain this entire copyright notice and
 * indicate the origin and nature of any modifications.
 *
 * I'd appreciate being given credit for this package in the documentation of
 * software which uses it, but that is not a requirement.
 *
 * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
 * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
 * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
 * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
 * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
 * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
 * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
 * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

#include "regguts.h"

/*
 * forward declarations, up here so forward datatypes etc. are defined early
 */
/* =====^!^===== begin forwards =====^!^===== */
/* automatically gathered by fwd; do not hand-edit */
/* === regcomp.c === */
int compile(regex_t *, const chr *, size_t, int);
static void moresubs(struct vars *, int);
static int freev(struct vars *, int);
static void makesearch(struct vars *, struct nfa *);
static struct subre *parse(struct vars *, int, int, struct state *, struct state *);
static struct subre *parsebranch(struct vars *, int, int, struct state *, struct state *, int);
static void parseqatom(struct vars *, int, int, struct state *, struct state *, struct subre *);
static void nonword(struct vars *, int, struct state *, struct state *);
static void word(struct vars *, int, struct state *, struct state *);
static int scannum(struct vars *);
static void repeat(struct vars *, struct state *, struct state *, int, int);
static void bracket(struct vars *, struct state *, struct state *);
static void cbracket(struct vars *, struct state *, struct state *);
static void brackpart(struct vars *, struct state *, struct state *);
static const chr *scanplain(struct vars *);
static void onechr(struct vars *, pchr, struct state *, struct state *);
static void dovec(struct vars *, struct cvec *, struct state *, struct state *);
static void wordchrs(struct vars *);
static struct subre *subre(struct vars *, int, int, struct state *, struct state *);
static void freesubre(struct vars *, struct subre *);
static void freesrnode(struct vars *, struct subre *);
static void optst(struct vars *, struct subre *);
static int numst(struct subre *, int);
static void markst(struct subre *);
static void cleanst(struct vars *);
static long nfatree(struct vars *, struct subre *, FILE *);
static long nfanode(struct vars *, struct subre *, FILE *);
static int newlacon(struct vars *, struct state *, struct state *, int);
static void freelacons(struct subre *, int);
static void rfree(regex_t *);
static void dump(regex_t *, FILE *);
static void dumpst(struct subre *, FILE *, int);
static void stdump(struct subre *, FILE *, int);
static const char *stid(struct subre *, char *, size_t);
/* === regc_lex.c === */
static void lexstart(struct vars *);
static void prefixes(struct vars *);
static void lexnest(struct vars *, const chr *, const chr *);
static void lexword(struct vars *);
static int next(struct vars *);
static int lexescape(struct vars *);
static chr lexdigits(struct vars *, int, int, int);
static int brenext(struct vars *, pchr);
static void skip(struct vars *);
static chr newline(NOPARMS);
#ifdef REG_DEBUG
static const chr *ch(NOPARMS);
#endif
static chr chrnamed(struct vars *, const chr *, const chr *, pchr);
/* === regc_color.c === */
static void initcm(struct vars *, struct colormap *);
static void freecm(struct colormap *);
static void cmtreefree(struct colormap *, union tree *, int);
static color setcolor(struct colormap *, pchr, pcolor);
static color maxcolor(struct colormap *);
static color newcolor(struct colormap *);
static void freecolor(struct colormap *, pcolor);
static color pseudocolor(struct colormap *);
static color subcolor(struct colormap *, pchr c);
static color newsub(struct colormap *, pcolor);
static void subrange(struct vars *, pchr, pchr, struct state *, struct state *);
static void subblock(struct vars *, pchr, struct state *, struct state *);
static void okcolors(struct nfa *, struct colormap *);
static void colorchain(struct colormap *, struct arc *);
static void uncolorchain(struct colormap *, struct arc *);
static void rainbow(struct nfa *, struct colormap *, int, pcolor, struct state *, struct state *);
static void colorcomplement(struct nfa *, struct colormap *, int, struct state *, struct state *, struct state *);
#ifdef REG_DEBUG
static void dumpcolors(struct colormap *, FILE *);
static void fillcheck(struct colormap *, union tree *, int, FILE *);
static void dumpchr(pchr, FILE *);
#endif
/* === regc_nfa.c === */
static struct nfa *newnfa(struct vars *, struct colormap *, struct nfa *);
static void freenfa(struct nfa *);
static struct state *newstate(struct nfa *);
static struct state *newfstate(struct nfa *, int flag);
static void dropstate(struct nfa *, struct state *);
static void freestate(struct nfa *, struct state *);
static void destroystate(struct nfa *, struct state *);
static void newarc(struct nfa *, int, pcolor, struct state *, struct state *);
static struct arc *allocarc(struct nfa *, struct state *);
static void freearc(struct nfa *, struct arc *);
static struct arc *findarc(struct state *, int, pcolor);
static void cparc(struct nfa *, struct arc *, struct state *, struct state *);
static void moveins(struct nfa *, struct state *, struct state *);
static void copyins(struct nfa *, struct state *, struct state *);
static void moveouts(struct nfa *, struct state *, struct state *);
static void copyouts(struct nfa *, struct state *, struct state *);
static void cloneouts(struct nfa *, struct state *, struct state *, struct state *, int);
static void delsub(struct nfa *, struct state *, struct state *);
static void deltraverse(struct nfa *, struct state *, struct state *);
static void dupnfa(struct nfa *, struct state *, struct state *, struct state *, struct state *);
static void duptraverse(struct nfa *, struct state *, struct state *);
static void cleartraverse(struct nfa *, struct state *);
static void specialcolors(struct nfa *);
static long optimize(struct nfa *, FILE *);
static void pullback(struct nfa *, FILE *);
static int pull(struct nfa *, struct arc *);
static void pushfwd(struct nfa *, FILE *);
static int push(struct nfa *, struct arc *);
#define     INCOMPATIBLE      1     /* destroys arc */
#define     SATISFIED   2     /* constraint satisfied */
#define     COMPATIBLE  3     /* compatible but not satisfied yet */
static int combine(struct arc *, struct arc *);
static void fixempties(struct nfa *, FILE *);
static int unempty(struct nfa *, struct arc *);
static void cleanup(struct nfa *);
static void markreachable(struct nfa *, struct state *, struct state *, struct state *);
static void markcanreach(struct nfa *, struct state *, struct state *, struct state *);
static long analyze(struct nfa *);
static void compact(struct nfa *, struct cnfa *);
static void carcsort(struct carc *, struct carc *);
static void freecnfa(struct cnfa *);
static void dumpnfa(struct nfa *, FILE *);
#ifdef REG_DEBUG
static void dumpstate(struct state *, FILE *);
static void dumparcs(struct state *, FILE *);
static int dumprarcs(struct arc *, struct state *, FILE *, int);
static void dumparc(struct arc *, struct state *, FILE *);
#endif
static void dumpcnfa(struct cnfa *, FILE *);
#ifdef REG_DEBUG
static void dumpcstate(int, struct carc *, struct cnfa *, FILE *);
#endif
/* === regc_cvec.c === */
static struct cvec *clearcvec(struct cvec *);
static void addchr(struct cvec *, pchr);
static void addrange(struct cvec *, pchr, pchr);
static struct cvec *newcvec(int, int);
static struct cvec *getcvec(struct vars *, int, int);
static void freecvec(struct cvec *);
/* === regc_locale.c === */
static celt element(struct vars *, const chr *, const chr *);
static struct cvec *range(struct vars *, celt, celt, int);
static int before(celt, celt);
static struct cvec *eclass(struct vars *, celt, int);
static struct cvec *cclass(struct vars *, const chr *, const chr *, int);
static struct cvec *allcases(struct vars *, pchr);
static int cmp(const chr *, const chr *, size_t);
static int casecmp(const chr *, const chr *, size_t);
/* automatically gathered by fwd; do not hand-edit */
/* =====^!^===== end forwards =====^!^===== */

/* internal variables, bundled for easy passing around */
struct vars {
    regex_t *re;
    const chr *now;           /* scan pointer into string */
    const chr *stop;          /* end of string */
    const chr *savenow;       /* saved now and stop for "subroutine call" */
    const chr *savestop;
    int err;                  /* error code (0 if none) */
    int cflags;               /* copy of compile flags */
    int lasttype;       /* type of previous token */
    int nexttype;       /* type of next token */
    chr nextvalue;            /* value (if any) of next token */
    int lexcon;               /* lexical context type (see lex.c) */
    int nsubexp;        /* subexpression count */
    struct subre **subs;      /* subRE pointer vector */
    size_t nsubs;       /* length of vector */
    struct subre *sub10[10];  /* initial vector, enough for most */
    struct nfa *nfa;          /* the NFA */
    struct colormap *cm;      /* character color map */
    color nlcolor;            /* color of newline */
    struct state *wordchrs;   /* state in nfa holding word-char outarcs */
    struct subre *tree;       /* subexpression tree */
    struct subre *treechain;  /* all tree nodes allocated */
    struct subre *treefree;   /* any free tree nodes */
    int ntree;                /* number of tree nodes */
    struct cvec *cv;          /* interface cvec */
    struct cvec *cv2;         /* utility cvec */
    struct subre *lacons;     /* lookahead-constraint vector */
    int nlacons;        /* size of lacons */
};

/* parsing macros; most know that `v' is the struct vars pointer */
#define     NEXT()      (next(v))         /* advance by one token */
#define     SEE(t)      (v->nexttype == (t))    /* is next token this? */
#define     EAT(t)      (SEE(t) && next(v))     /* if next is this, swallow it */
#define     VISERR(vv)  ((vv)->err != 0)/* have we seen an error yet? */
#define     ISERR()     VISERR(v)
#define     VERR(vv,e) \
      ((vv)->nexttype = EOS, ((vv)->err) ? (vv)->err : ((vv)->err = (e)))
#define     ERR(e)      VERR(v, e)        /* record an error */
#define     NOERR()     {if (ISERR()) return;}  /* if error seen, return */
#define     NOERRN()    {if (ISERR()) return NULL;}   /* NOERR with retval */
#define     NOERRZ()    {if (ISERR()) return 0;}      /* NOERR with retval */
#define     INSIST(c, e)      ((c) ? 0 : ERR(e))      /* if condition false, error */
#define     NOTE(b)     (v->re->re_info |= (b))       /* note visible condition */
#define     EMPTYARC(x, y)    newarc(v->nfa, EMPTY, 0, x, y)

/* token type codes, some also used as NFA arc types */
#define     EMPTY 'n'         /* no token present */
#define     EOS   'e'         /* end of string */
#define     PLAIN 'p'         /* ordinary character */
#define     DIGIT 'd'         /* digit (in bound) */
#define     BACKREF     'b'         /* back reference */
#define     COLLEL      'I'         /* start of [. */
#define     ECLASS      'E'         /* start of [= */
#define     CCLASS      'C'         /* start of [: */
#define     END   'X'         /* end of [. [= [: */
#define     RANGE 'R'         /* - within [] which might be range delim. */
#define     LACON 'L'         /* lookahead constraint subRE */
#define     AHEAD 'a'         /* color-lookahead arc */
#define     BEHIND      'r'         /* color-lookbehind arc */
#define     WBDRY 'w'         /* word boundary constraint */
#define     NWBDRY      'W'         /* non-word-boundary constraint */
#define     SBEGIN      'A'         /* beginning of string (even if not BOL) */
#define     SEND  'Z'         /* end of string (even if not EOL) */
#define     PREFER      'P'         /* length preference */

/* is an arc colored, and hence on a color chain? */
#define     COLORED(a) \
      ((a)->type == PLAIN || (a)->type == AHEAD || (a)->type == BEHIND)

/* static function list */
static struct fns functions = {
    rfree,              /* regfree insides */
};

/*
 - compile - compile regular expression
 ^ int compile(regex_t *, const chr *, size_t, int);
 */
int
compile(
    regex_t *re,
    const chr *string,
    size_t len,
    int flags)
{
    AllocVars(v);
    struct guts *g;
    int i;
    size_t j;
    FILE *debug = (flags&REG_PROGRESS) ? stdout : NULL;
#define     CNOERR()    { if (ISERR()) return freev(v, v->err); }

    /*
     * Sanity checks.
     */

    if (re == NULL || string == NULL) {
      FreeVars(v);
      return REG_INVARG;
    }
    if ((flags&REG_QUOTE) && (flags&(REG_ADVANCED|REG_EXPANDED|REG_NEWLINE))) {
      FreeVars(v);
      return REG_INVARG;
    }
    if (!(flags&REG_EXTENDED) && (flags&REG_ADVF)) {
      FreeVars(v);
      return REG_INVARG;
    }

    /*
     * Initial setup (after which freev() is callable).
     */

    v->re = re;
    v->now = string;
    v->stop = v->now + len;
    v->savenow = v->savestop = NULL;
    v->err = 0;
    v->cflags = flags;
    v->nsubexp = 0;
    v->subs = v->sub10;
    v->nsubs = 10;
    for (j = 0; j < v->nsubs; j++) {
      v->subs[j] = NULL;
    }
    v->nfa = NULL;
    v->cm = NULL;
    v->nlcolor = COLORLESS;
    v->wordchrs = NULL;
    v->tree = NULL;
    v->treechain = NULL;
    v->treefree = NULL;
    v->cv = NULL;
    v->cv2 = NULL;
    v->lacons = NULL;
    v->nlacons = 0;
    re->re_magic = REMAGIC;
    re->re_info = 0;          /* bits get set during parse */
    re->re_csize = sizeof(chr);
    re->re_guts = NULL;
    re->re_fns = VS(&functions);

    /*
     * More complex setup, malloced things.
     */

    re->re_guts = VS(MALLOC(sizeof(struct guts)));
    if (re->re_guts == NULL) {
      return freev(v, REG_ESPACE);
    }
    g = (struct guts *) re->re_guts;
    g->tree = NULL;
    initcm(v, &g->cmap);
    v->cm = &g->cmap;
    g->lacons = NULL;
    g->nlacons = 0;
    ZAPCNFA(g->search);
    v->nfa = newnfa(v, v->cm, NULL);
    CNOERR();
    v->cv = newcvec(100, 20);
    if (v->cv == NULL) {
      return freev(v, REG_ESPACE);
    }

    /*
     * Parsing.
     */

    lexstart(v);        /* also handles prefixes */
    if ((v->cflags&REG_NLSTOP) || (v->cflags&REG_NLANCH)) {
      /*
       * Assign newline a unique color.
       */

      v->nlcolor = subcolor(v->cm, newline());
      okcolors(v->nfa, v->cm);
    }
    CNOERR();
    v->tree = parse(v, EOS, PLAIN, v->nfa->init, v->nfa->final);
    assert(SEE(EOS));         /* even if error; ISERR() => SEE(EOS) */
    CNOERR();
    assert(v->tree != NULL);

    /*
     * Finish setup of nfa and its subre tree.
     */

    specialcolors(v->nfa);
    CNOERR();
    if (debug != NULL) {
      fprintf(debug, "\n\n\n========= RAW ==========\n");
      dumpnfa(v->nfa, debug);
      dumpst(v->tree, debug, 1);
    }
    optst(v, v->tree);
    v->ntree = numst(v->tree, 1);
    markst(v->tree);
    cleanst(v);
    if (debug != NULL) {
      fprintf(debug, "\n\n\n========= TREE FIXED ==========\n");
      dumpst(v->tree, debug, 1);
    }

    /*
     * Build compacted NFAs for tree and lacons.
     */

    re->re_info |= nfatree(v, v->tree, debug);
    CNOERR();
    assert(v->nlacons == 0 || v->lacons != NULL);
    for (i = 1; i < v->nlacons; i++) {
      if (debug != NULL) {
          fprintf(debug, "\n\n\n========= LA%d ==========\n", i);
      }
      nfanode(v, &v->lacons[i], debug);
    }
    CNOERR();
    if (v->tree->flags&SHORTER) {
      NOTE(REG_USHORTEST);
    }

    /*
     * Build compacted NFAs for tree, lacons, fast search.
     */

    if (debug != NULL) {
      fprintf(debug, "\n\n\n========= SEARCH ==========\n");
    }

    /*
     * Can sacrifice main NFA now, so use it as work area.
     */

    (DISCARD) optimize(v->nfa, debug);
    CNOERR();
    makesearch(v, v->nfa);
    CNOERR();
    compact(v->nfa, &g->search);
    CNOERR();

    /*
     * Looks okay, package it up.
     */

    re->re_nsub = v->nsubexp;
    v->re = NULL;       /* freev no longer frees re */
    g->magic = GUTSMAGIC;
    g->cflags = v->cflags;
    g->info = re->re_info;
    g->nsub = re->re_nsub;
    g->tree = v->tree;
    v->tree = NULL;
    g->ntree = v->ntree;
    g->compare = (v->cflags&REG_ICASE) ? casecmp : cmp;
    g->lacons = v->lacons;
    v->lacons = NULL;
    g->nlacons = v->nlacons;

    if (flags&REG_DUMP) {
      dump(re, stdout);
    }

    assert(v->err == 0);
    return freev(v, 0);
}

/*
 - moresubs - enlarge subRE vector
 ^ static void moresubs(struct vars *, int);
 */
static void
moresubs(
    struct vars *v,
    int wanted)               /* want enough room for this one */
{
    struct subre **p;
    size_t n;

    assert(wanted > 0 && (size_t)wanted >= v->nsubs);
    n = (size_t)wanted * 3 / 2 + 1;
    if (v->subs == v->sub10) {
      p = (struct subre **) MALLOC(n * sizeof(struct subre *));
      if (p != NULL) {
          memcpy(p, v->subs, v->nsubs * sizeof(struct subre *));
      }
    } else {
      p = (struct subre **) REALLOC(v->subs, n*sizeof(struct subre *));
    }
    if (p == NULL) {
      ERR(REG_ESPACE);
      return;
    }

    v->subs = p;
    for (p = &v->subs[v->nsubs]; v->nsubs < n; p++, v->nsubs++) {
      *p = NULL;
    }
    assert(v->nsubs == n);
    assert((size_t)wanted < v->nsubs);
}

/*
 - freev - free vars struct's substructures where necessary
 * Optionally does error-number setting, and always returns error code (if
 * any), to make error-handling code terser.
 ^ static int freev(struct vars *, int);
 */
static int
freev(
    struct vars *v,
    int err)
{
    register int ret;

    if (v->re != NULL) {
      rfree(v->re);
    }
    if (v->subs != v->sub10) {
      FREE(v->subs);
    }
    if (v->nfa != NULL) {
      freenfa(v->nfa);
    }
    if (v->tree != NULL) {
      freesubre(v, v->tree);
    }
    if (v->treechain != NULL) {
      cleanst(v);
    }
    if (v->cv != NULL) {
      freecvec(v->cv);
    }
    if (v->cv2 != NULL) {
      freecvec(v->cv2);
    }
    if (v->lacons != NULL) {
      freelacons(v->lacons, v->nlacons);
    }
    ERR(err);                 /* nop if err==0 */

    ret = v->err;
    FreeVars(v);
    return ret;
}

/*
 - makesearch - turn an NFA into a search NFA (implicit prepend of .*?)
 * NFA must have been optimize()d already.
 ^ static void makesearch(struct vars *, struct nfa *);
 */
static void
makesearch(
    struct vars *v,
    struct nfa *nfa)
{
    struct arc *a, *b;
    struct state *pre = nfa->pre;
    struct state *s, *s2, *slist;

    /*
     * No loops are needed if it's anchored.
     */

    for (a = pre->outs; a != NULL; a = a->outchain) {
      assert(a->type == PLAIN);
      if (a->co != nfa->bos[0] && a->co != nfa->bos[1]) {
          break;
      }
    }
    if (a != NULL) {
      /*
       * Add implicit .* in front.
       */

      rainbow(nfa, v->cm, PLAIN, COLORLESS, pre, pre);

      /*
       * And ^* and \A* too -- not always necessary, but harmless.
       */

      newarc(nfa, PLAIN, nfa->bos[0], pre, pre);
      newarc(nfa, PLAIN, nfa->bos[1], pre, pre);
    }

    /*
     * Now here's the subtle part. Because many REs have no lookback
     * constraints, often knowing when you were in the pre state tells you
     * little; it's the next state(s) that are informative. But some of them
     * may have other inarcs, i.e. it may be possible to make actual progress
     * and then return to one of them. We must de-optimize such cases,
     * splitting each such state into progress and no-progress states.
     */

    /*
     * First, make a list of the states.
     */

    slist = NULL;
    for (a=pre->outs ; a!=NULL ; a=a->outchain) {
      s = a->to;
      for (b=s->ins ; b!=NULL ; b=b->inchain) {
          if (b->from != pre) {
            break;
          }
      }
      if (b != NULL && s->tmp == NULL) {
          /*
           * Must be split if not already in the list (fixes bugs 505048,
           * 230589, 840258, 504785).
           */

          s->tmp = slist;
          slist = s;
      }
    }

    /*
     * Do the splits.
     */

    for (s=slist ; s!=NULL ; s=s2) {
      s2 = newstate(nfa);

      copyouts(nfa, s, s2);
      for (a=s->ins ; a!=NULL ; a=b) {
          b = a->inchain;

          if (a->from != pre) {
            cparc(nfa, a, a->from, s2);
            freearc(nfa, a);
          }
      }
      s2 = s->tmp;
      s->tmp = NULL;          /* clean up while we're at it */
    }
}

/*
 - parse - parse an RE
 * This is actually just the top level, which parses a bunch of branches tied
 * together with '|'. They appear in the tree as the left children of a chain
 * of '|' subres.
 ^ static struct subre *parse(struct vars *, int, int, struct state *,
 ^    struct state *);
 */
static struct subre *
parse(
    struct vars *v,
    int stopper,        /* EOS or ')' */
    int type,                 /* LACON (lookahead subRE) or PLAIN */
    struct state *init,       /* initial state */
    struct state *final)      /* final state */
{
    struct state *left, *right;     /* scaffolding for branch */
    struct subre *branches;   /* top level */
    struct subre *branch;     /* current branch */
    struct subre *t;          /* temporary */
    int firstbranch;          /* is this the first branch? */

    assert(stopper == ')' || stopper == EOS);

    branches = subre(v, '|', LONGER, init, final);
    NOERRN();
    branch = branches;
    firstbranch = 1;
    do {    /* a branch */
      if (!firstbranch) {
          /*
           * Need a place to hang the branch.
           */

          branch->right = subre(v, '|', LONGER, init, final);
          NOERRN();
          branch = branch->right;
      }
      firstbranch = 0;
      left = newstate(v->nfa);
      right = newstate(v->nfa);
      NOERRN();
      EMPTYARC(init, left);
      EMPTYARC(right, final);
      NOERRN();
      branch->left = parsebranch(v, stopper, type, left, right, 0);
      NOERRN();
      branch->flags |= UP(branch->flags | branch->left->flags);
      if ((branch->flags &~ branches->flags) != 0) {  /* new flags */
          for (t = branches; t != branch; t = t->right) {
            t->flags |= branch->flags;
          }
      }
    } while (EAT('|'));
    assert(SEE(stopper) || SEE(EOS));

    if (!SEE(stopper)) {
      assert(stopper == ')' && SEE(EOS));
      ERR(REG_EPAREN);
    }

    /*
     * Optimize out simple cases.
     */

    if (branch == branches) { /* only one branch */
      assert(branch->right == NULL);
      t = branch->left;
      branch->left = NULL;
      freesubre(v, branches);
      branches = t;
    } else if (!MESSY(branches->flags)) { /* no interesting innards */
      freesubre(v, branches->left);
      branches->left = NULL;
      freesubre(v, branches->right);
      branches->right = NULL;
      branches->op = '=';
    }

    return branches;
}

/*
 - parsebranch - parse one branch of an RE
 * This mostly manages concatenation, working closely with parseqatom().
 * Concatenated things are bundled up as much as possible, with separate
 * ',' nodes introduced only when necessary due to substructure.
 ^ static struct subre *parsebranch(struct vars *, int, int, struct state *,
 ^    struct state *, int);
 */
static struct subre *
parsebranch(
    struct vars *v,
    int stopper,        /* EOS or ')' */
    int type,                 /* LACON (lookahead subRE) or PLAIN */
    struct state *left,       /* leftmost state */
    struct state *right,      /* rightmost state */
    int partial)        /* is this only part of a branch? */
{
    struct state *lp;         /* left end of current construct */
    int seencontent;          /* is there anything in this branch yet? */
    struct subre *t;

    lp = left;
    seencontent = 0;
    t = subre(v, '=', 0, left, right);    /* op '=' is tentative */
    NOERRN();
    while (!SEE('|') && !SEE(stopper) && !SEE(EOS)) {
      if (seencontent) {      /* implicit concat operator */
          lp = newstate(v->nfa);
          NOERRN();
          moveins(v->nfa, right, lp);
      }
      seencontent = 1;

      /* NB, recursion in parseqatom() may swallow rest of branch */
      parseqatom(v, stopper, type, lp, right, t);
    }

    if (!seencontent) {       /* empty branch */
      if (!partial) {
          NOTE(REG_UUNSPEC);
      }
      assert(lp == left);
      EMPTYARC(left, right);
    }

    return t;
}

/*
 - parseqatom - parse one quantified atom or constraint of an RE
 * The bookkeeping near the end cooperates very closely with parsebranch(); in
 * particular, it contains a recursion that can involve parsing the rest of
 * the branch, making this function's name somewhat inaccurate.
 ^ static void parseqatom(struct vars *, int, int, struct state *,
 ^    struct state *, struct subre *);
 */
static void
parseqatom(
    struct vars *v,
    int stopper,        /* EOS or ')' */
    int type,                 /* LACON (lookahead subRE) or PLAIN */
    struct state *lp,         /* left state to hang it on */
    struct state *rp,         /* right state to hang it on */
    struct subre *top)        /* subtree top */
{
    struct state *s;          /* temporaries for new states */
    struct state *s2;
#define     ARCV(t, val)      newarc(v->nfa, t, val, lp, rp)
    int m, n;
    struct subre *atom;       /* atom's subtree */
    struct subre *t;
    int cap;                  /* capturing parens? */
    int pos;                  /* positive lookahead? */
    int subno;                /* capturing-parens or backref number */
    int atomtype;
    int qprefer;        /* quantifier short/long preference */
    int f;
    struct subre **atomp;     /* where the pointer to atom is */

    /*
     * Initial bookkeeping.
     */

    atom = NULL;
    assert(lp->nouts == 0);   /* must string new code */
    assert(rp->nins == 0);    /* between lp and rp */
    subno = 0;                /* just to shut lint up */

    /*
     * An atom or constraint...
     */

    atomtype = v->nexttype;
    switch (atomtype) {
      /* first, constraints, which end by returning */
    case '^':
      ARCV('^', 1);
      if (v->cflags&REG_NLANCH) {
          ARCV(BEHIND, v->nlcolor);
      }
      NEXT();
      return;
    case '$':
      ARCV('$', 1);
      if (v->cflags&REG_NLANCH) {
          ARCV(AHEAD, v->nlcolor);
      }
      NEXT();
      return;
    case SBEGIN:
      ARCV('^', 1);           /* BOL */
      ARCV('^', 0);           /* or BOS */
      NEXT();
      return;
    case SEND:
      ARCV('$', 1);           /* EOL */
      ARCV('$', 0);           /* or EOS */
      NEXT();
      return;
    case '<':
      wordchrs(v);            /* does NEXT() */
      s = newstate(v->nfa);
      NOERR();
      nonword(v, BEHIND, lp, s);
      word(v, AHEAD, s, rp);
      return;
    case '>':
      wordchrs(v);            /* does NEXT() */
      s = newstate(v->nfa);
      NOERR();
      word(v, BEHIND, lp, s);
      nonword(v, AHEAD, s, rp);
      return;
    case WBDRY:
      wordchrs(v);            /* does NEXT() */
      s = newstate(v->nfa);
      NOERR();
      nonword(v, BEHIND, lp, s);
      word(v, AHEAD, s, rp);
      s = newstate(v->nfa);
      NOERR();
      word(v, BEHIND, lp, s);
      nonword(v, AHEAD, s, rp);
      return;
    case NWBDRY:
      wordchrs(v);            /* does NEXT() */
      s = newstate(v->nfa);
      NOERR();
      word(v, BEHIND, lp, s);
      word(v, AHEAD, s, rp);
      s = newstate(v->nfa);
      NOERR();
      nonword(v, BEHIND, lp, s);
      nonword(v, AHEAD, s, rp);
      return;
    case LACON:               /* lookahead constraint */
      pos = v->nextvalue;
      NEXT();
      s = newstate(v->nfa);
      s2 = newstate(v->nfa);
      NOERR();
      t = parse(v, ')', LACON, s, s2);
      freesubre(v, t);  /* internal structure irrelevant */
      assert(SEE(')') || ISERR());
      NEXT();
      n = newlacon(v, s, s2, pos);
      NOERR();
      ARCV(LACON, n);
      return;

      /*
       * Then errors, to get them out of the way.
       */

    case '*':
    case '+':
    case '?':
    case '{':
      ERR(REG_BADRPT);
      return;
    default:
      ERR(REG_ASSERT);
      return;

      /*
       * Then plain characters, and minor variants on that theme.
       */

    case ')':                 /* unbalanced paren */
      if ((v->cflags&REG_ADVANCED) != REG_EXTENDED) {
          ERR(REG_EPAREN);
          return;
      }

      /*
       * Legal in EREs due to specification botch.
       */

      NOTE(REG_UPBOTCH);
      /* fallthrough into case PLAIN */
    case PLAIN:
      onechr(v, v->nextvalue, lp, rp);
      okcolors(v->nfa, v->cm);
      NOERR();
      NEXT();
      break;
    case '[':
      if (v->nextvalue == 1) {
          bracket(v, lp, rp);
      } else {
          cbracket(v, lp, rp);
      }
      assert(SEE(']') || ISERR());
      NEXT();
      break;
    case '.':
      rainbow(v->nfa, v->cm, PLAIN,
            (v->cflags&REG_NLSTOP) ? v->nlcolor : COLORLESS, lp, rp);
      NEXT();
      break;

      /*
       * And finally the ugly stuff.
       */

    case '(':                 /* value flags as capturing or non */
      cap = (type == LACON) ? 0 : v->nextvalue;
      if (cap) {
          v->nsubexp++;
          subno = v->nsubexp;
          if ((size_t)subno >= v->nsubs) {
            moresubs(v, subno);
          }
          assert((size_t)subno < v->nsubs);
      } else {
          atomtype = PLAIN;   /* something that's not '(' */
      }
      NEXT();

      /*
       * Need new endpoints because tree will contain pointers.
       */

      s = newstate(v->nfa);
      s2 = newstate(v->nfa);
      NOERR();
      EMPTYARC(lp, s);
      EMPTYARC(s2, rp);
      NOERR();
      atom = parse(v, ')', PLAIN, s, s2);
      assert(SEE(')') || ISERR());
      NEXT();
      NOERR();
      if (cap) {
          v->subs[subno] = atom;
          t = subre(v, '(', atom->flags|CAP, lp, rp);
          NOERR();
          t->subno = subno;
          t->left = atom;
          atom = t;
      }

      /*
       * Postpone everything else pending possible {0}.
       */

      break;
    case BACKREF:       /* the Feature From The Black Lagoon */
      INSIST(type != LACON, REG_ESUBREG);
      INSIST(v->nextvalue < v->nsubs, REG_ESUBREG);
      INSIST(v->subs[v->nextvalue] != NULL, REG_ESUBREG);
      NOERR();
      assert(v->nextvalue > 0);
      atom = subre(v, 'b', BACKR, lp, rp);
      subno = v->nextvalue;
      atom->subno = subno;
      EMPTYARC(lp, rp); /* temporarily, so there's something */
      NEXT();
      break;
    }

    /*
     * ...and an atom may be followed by a quantifier.
     */

    switch (v->nexttype) {
    case '*':
      m = 0;
      n = INFINITY;
      qprefer = (v->nextvalue) ? LONGER : SHORTER;
      NEXT();
      break;
    case '+':
      m = 1;
      n = INFINITY;
      qprefer = (v->nextvalue) ? LONGER : SHORTER;
      NEXT();
      break;
    case '?':
      m = 0;
      n = 1;
      qprefer = (v->nextvalue) ? LONGER : SHORTER;
      NEXT();
      break;
    case '{':
      NEXT();
      m = scannum(v);
      if (EAT(',')) {
          if (SEE(DIGIT)) {
            n = scannum(v);
          } else {
            n = INFINITY;
          }
          if (m > n) {
            ERR(REG_BADBR);
            return;
          }

          /*
           * {m,n} exercises preference, even if it's {m,m}
           */

          qprefer = (v->nextvalue) ? LONGER : SHORTER;
      } else {
          n = m;
          /*
           * {m} passes operand's preference through.
           */

          qprefer = 0;
      }
      if (!SEE('}')) {  /* catches errors too */
          ERR(REG_BADBR);
          return;
      }
      NEXT();
      break;
    default:                  /* no quantifier */
      m = n = 1;
      qprefer = 0;
      break;
    }

    /*
     * Annoying special case: {0} or {0,0} cancels everything.
     */

    if (m == 0 && n == 0) {
      if (atom != NULL) {
          freesubre(v, atom);
      }
      if (atomtype == '(') {
          v->subs[subno] = NULL;
      }
      delsub(v->nfa, lp, rp);
      EMPTYARC(lp, rp);
      return;
    }

    /*
     * If not a messy case, avoid hard part.
     */

    assert(!MESSY(top->flags));
    f = top->flags | qprefer | ((atom != NULL) ? atom->flags : 0);
    if (atomtype != '(' && atomtype != BACKREF && !MESSY(UP(f))) {
      if (!(m == 1 && n == 1)) {
          repeat(v, lp, rp, m, n);
      }
      if (atom != NULL) {
          freesubre(v, atom);
      }
      top->flags = f;
      return;
    }

    /*
     * hard part: something messy
     * That is, capturing parens, back reference, short/long clash, or an atom
     * with substructure containing one of those.
     */

    /*
     * Now we'll need a subre for the contents even if they're boring.
     */

    if (atom == NULL) {
      atom = subre(v, '=', 0, lp, rp);
      NOERR();
    }

    /*
     * Prepare a general-purpose state skeleton.
     *
     *    ---> [s] ---prefix---> [begin] ---atom---> [end] ----rest---> [rp]
     *   /                                            /
     * [lp] ----> [s2] ----bypass---------------------
     *
     * where bypass is an empty, and prefix is some repetitions of atom
     */

    s = newstate(v->nfa);     /* first, new endpoints for the atom */
    s2 = newstate(v->nfa);
    NOERR();
    moveouts(v->nfa, lp, s);
    moveins(v->nfa, rp, s2);
    NOERR();
    atom->begin = s;
    atom->end = s2;
    s = newstate(v->nfa);     /* and spots for prefix and bypass */
    s2 = newstate(v->nfa);
    NOERR();
    EMPTYARC(lp, s);
    EMPTYARC(lp, s2);
    NOERR();

    /*
     * Break remaining subRE into x{...} and what follows.
     */

    t = subre(v, '.', COMBINE(qprefer, atom->flags), lp, rp);
    t->left = atom;
    atomp = &t->left;

    /*
     * Here we should recurse... but we must postpone that to the end.
     */

    /*
     * Split top into prefix and remaining.
     */

    assert(top->op == '=' && top->left == NULL && top->right == NULL);
    top->left = subre(v, '=', top->flags, top->begin, lp);
    top->op = '.';
    top->right = t;

    /*
     * If it's a backref, now is the time to replicate the subNFA.
     */

    if (atomtype == BACKREF) {
      assert(atom->begin->nouts == 1);    /* just the EMPTY */
      delsub(v->nfa, atom->begin, atom->end);
      assert(v->subs[subno] != NULL);

      /*
       * And here's why the recursion got postponed: it must wait until the
       * skeleton is filled in, because it may hit a backref that wants to
       * copy the filled-in skeleton.
       */

      dupnfa(v->nfa, v->subs[subno]->begin, v->subs[subno]->end,
            atom->begin, atom->end);
      NOERR();
    }

    /*
     * It's quantifier time; first, turn x{0,...} into x{1,...}|empty
     */

    if (m == 0) {
      EMPTYARC(s2, atom->end);/* the bypass */
      assert(PREF(qprefer) != 0);
      f = COMBINE(qprefer, atom->flags);
      t = subre(v, '|', f, lp, atom->end);
      NOERR();
      t->left = atom;
      t->right = subre(v, '|', PREF(f), s2, atom->end);
      NOERR();
      t->right->left = subre(v, '=', 0, s2, atom->end);
      NOERR();
      *atomp = t;
      atomp = &t->left;
      m = 1;
    }

    /*
     * Deal with the rest of the quantifier.
     */

    if (atomtype == BACKREF) {
      /*
       * Special case: backrefs have internal quantifiers.
       */

      EMPTYARC(s, atom->begin);     /* empty prefix */

      /*
       * Just stuff everything into atom.
       */

      repeat(v, atom->begin, atom->end, m, n);
      atom->min = (short) m;
      atom->max = (short) n;
      atom->flags |= COMBINE(qprefer, atom->flags);
    } else if (m == 1 && n == 1) {
      /*
       * No/vacuous quantifier: done.
       */

      EMPTYARC(s, atom->begin);     /* empty prefix */
    } else {
      /*
       * Turn x{m,n} into x{m-1,n-1}x, with capturing parens in only second
       * x
       */

      dupnfa(v->nfa, atom->begin, atom->end, s, atom->begin);
      assert(m >= 1 && m != INFINITY && n >= 1);
      repeat(v, s, atom->begin, m-1, (n == INFINITY) ? n : n-1);
      f = COMBINE(qprefer, atom->flags);
      t = subre(v, '.', f, s, atom->end); /* prefix and atom */
      NOERR();
      t->left = subre(v, '=', PREF(f), s, atom->begin);
      NOERR();
      t->right = atom;
      *atomp = t;
    }

    /*
     * And finally, look after that postponed recursion.
     */

    t = top->right;
    if (!(SEE('|') || SEE(stopper) || SEE(EOS))) {
      t->right = parsebranch(v, stopper, type, atom->end, rp, 1);
    } else {
      EMPTYARC(atom->end, rp);
      t->right = subre(v, '=', 0, atom->end, rp);
    }
    assert(SEE('|') || SEE(stopper) || SEE(EOS));
    t->flags |= COMBINE(t->flags, t->right->flags);
    top->flags |= COMBINE(top->flags, t->flags);
}

/*
 - nonword - generate arcs for non-word-character ahead or behind
 ^ static void nonword(struct vars *, int, struct state *, struct state *);
 */
static void
nonword(
    struct vars *v,
    int dir,                  /* AHEAD or BEHIND */
    struct state *lp,
    struct state *rp)
{
    int anchor = (dir == AHEAD) ? '$' : '^';

    assert(dir == AHEAD || dir == BEHIND);
    newarc(v->nfa, anchor, 1, lp, rp);
    newarc(v->nfa, anchor, 0, lp, rp);
    colorcomplement(v->nfa, v->cm, dir, v->wordchrs, lp, rp);
    /* (no need for special attention to \n) */
}

/*
 - word - generate arcs for word character ahead or behind
 ^ static void word(struct vars *, int, struct state *, struct state *);
 */
static void
word(
    struct vars *v,
    int dir,                  /* AHEAD or BEHIND */
    struct state *lp,
    struct state *rp)
{
    assert(dir == AHEAD || dir == BEHIND);
    cloneouts(v->nfa, v->wordchrs, lp, rp, dir);
    /* (no need for special attention to \n) */
}

/*
 - scannum - scan a number
 ^ static int scannum(struct vars *);
 */
static int              /* value, <= DUPMAX */
scannum(
    struct vars *v)
{
    int n = 0;

    while (SEE(DIGIT) && n < DUPMAX) {
      n = n*10 + v->nextvalue;
      NEXT();
    }
    if (SEE(DIGIT) || n > DUPMAX) {
      ERR(REG_BADBR);
      return 0;
    }
    return n;
}

/*
 - repeat - replicate subNFA for quantifiers
 * The duplication sequences used here are chosen carefully so that any
 * pointers starting out pointing into the subexpression end up pointing into
 * the last occurrence. (Note that it may not be strung between the same left
 * and right end states, however!) This used to be important for the subRE
 * tree, although the important bits are now handled by the in-line code in
 * parse(), and when this is called, it doesn't matter any more.
 ^ static void repeat(struct vars *, struct state *, struct state *, int, int);
 */
static void
repeat(
    struct vars *v,
    struct state *lp,
    struct state *rp,
    int m,
    int n)
{
#define     SOME        2
#define     INF         3
#define     PAIR(x, y)  ((x)*4 + (y))
#define     REDUCE(x)   ( ((x) == INFINITY) ? INF : (((x) > 1) ? SOME : (x)) )
    const int rm = REDUCE(m);
    const int rn = REDUCE(n);
    struct state *s, *s2;

    switch (PAIR(rm, rn)) {
    case PAIR(0, 0):          /* empty string */
      delsub(v->nfa, lp, rp);
      EMPTYARC(lp, rp);
      break;
    case PAIR(0, 1):          /* do as x| */
      EMPTYARC(lp, rp);
      break;
    case PAIR(0, SOME):       /* do as x{1,n}| */
      repeat(v, lp, rp, 1, n);
      NOERR();
      EMPTYARC(lp, rp);
      break;
    case PAIR(0, INF):        /* loop x around */
      s = newstate(v->nfa);
      NOERR();
      moveouts(v->nfa, lp, s);
      moveins(v->nfa, rp, s);
      EMPTYARC(lp, s);
      EMPTYARC(s, rp);
      break;
    case PAIR(1, 1):          /* no action required */
      break;
    case PAIR(1, SOME):       /* do as x{0,n-1}x = (x{1,n-1}|)x */
      s = newstate(v->nfa);
      NOERR();
      moveouts(v->nfa, lp, s);
      dupnfa(v->nfa, s, rp, lp, s);
      NOERR();
      repeat(v, lp, s, 1, n-1);
      NOERR();
      EMPTYARC(lp, s);
      break;
    case PAIR(1, INF):        /* add loopback arc */
      s = newstate(v->nfa);
      s2 = newstate(v->nfa);
      NOERR();
      moveouts(v->nfa, lp, s);
      moveins(v->nfa, rp, s2);
      EMPTYARC(lp, s);
      EMPTYARC(s2, rp);
      EMPTYARC(s2, s);
      break;
    case PAIR(SOME, SOME):          /* do as x{m-1,n-1}x */
      s = newstate(v->nfa);
      NOERR();
      moveouts(v->nfa, lp, s);
      dupnfa(v->nfa, s, rp, lp, s);
      NOERR();
      repeat(v, lp, s, m-1, n-1);
      break;
    case PAIR(SOME, INF):           /* do as x{m-1,}x */
      s = newstate(v->nfa);
      NOERR();
      moveouts(v->nfa, lp, s);
      dupnfa(v->nfa, s, rp, lp, s);
      NOERR();
      repeat(v, lp, s, m-1, n);
      break;
    default:
      ERR(REG_ASSERT);
      break;
    }
}

/*
 - bracket - handle non-complemented bracket expression
 * Also called from cbracket for complemented bracket expressions.
 ^ static void bracket(struct vars *, struct state *, struct state *);
 */
static void
bracket(
    struct vars *v,
    struct state *lp,
    struct state *rp)
{
    assert(SEE('['));
    NEXT();
    while (!SEE(']') && !SEE(EOS)) {
      brackpart(v, lp, rp);
    }
    assert(SEE(']') || ISERR());
    okcolors(v->nfa, v->cm);
}

/*
 - cbracket - handle complemented bracket expression
 * We do it by calling bracket() with dummy endpoints, and then complementing
 * the result. The alternative would be to invoke rainbow(), and then delete
 * arcs as the b.e. is seen... but that gets messy.
 ^ static void cbracket(struct vars *, struct state *, struct state *);
 */
static void
cbracket(
    struct vars *v,
    struct state *lp,
    struct state *rp)
{
    struct state *left = newstate(v->nfa);
    struct state *right = newstate(v->nfa);

    NOERR();
    bracket(v, left, right);
    if (v->cflags&REG_NLSTOP) {
      newarc(v->nfa, PLAIN, v->nlcolor, left, right);
    }
    NOERR();

    assert(lp->nouts == 0);   /* all outarcs will be ours */

    /*
     * Easy part of complementing, and all there is to do since the MCCE code
     * was removed.
     */

    colorcomplement(v->nfa, v->cm, PLAIN, left, lp, rp);
    NOERR();
    dropstate(v->nfa, left);
    assert(right->nins == 0);
    freestate(v->nfa, right);
    return;
}

/*
 - brackpart - handle one item (or range) within a bracket expression
 ^ static void brackpart(struct vars *, struct state *, struct state *);
 */
static void
brackpart(
    struct vars *v,
    struct state *lp,
    struct state *rp)
{
    celt startc, endc;
    struct cvec *cv;
    const chr *startp, *endp;
    chr c[1];

    /*
     * Parse something, get rid of special cases, take shortcuts.
     */

    switch (v->nexttype) {
    case RANGE:               /* a-b-c or other botch */
      ERR(REG_ERANGE);
      return;
      break;
    case PLAIN:
      c[0] = v->nextvalue;
      NEXT();

      /*
       * Shortcut for ordinary chr (not range).
       */

      if (!SEE(RANGE)) {
          onechr(v, c[0], lp, rp);
          return;
      }
      startc = element(v, c, c+1);
      NOERR();
      break;
    case COLLEL:
      startp = v->now;
      endp = scanplain(v);
      INSIST(startp < endp, REG_ECOLLATE);
      NOERR();
      startc = element(v, startp, endp);
      NOERR();
      break;
    case ECLASS:
      startp = v->now;
      endp = scanplain(v);
      INSIST(startp < endp, REG_ECOLLATE);
      NOERR();
      startc = element(v, startp, endp);
      NOERR();
      cv = eclass(v, startc, (v->cflags&REG_ICASE));
      NOERR();
      dovec(v, cv, lp, rp);
      return;
      break;
    case CCLASS:
      startp = v->now;
      endp = scanplain(v);
      INSIST(startp < endp, REG_ECTYPE);
      NOERR();
      cv = cclass(v, startp, endp, (v->cflags&REG_ICASE));
      NOERR();
      dovec(v, cv, lp, rp);
      return;
      break;
    default:
      ERR(REG_ASSERT);
      return;
      break;
    }

    if (SEE(RANGE)) {
      NEXT();
      switch (v->nexttype) {
      case PLAIN:
      case RANGE:
          c[0] = v->nextvalue;
          NEXT();
          endc = element(v, c, c+1);
          NOERR();
          break;
      case COLLEL:
          startp = v->now;
          endp = scanplain(v);
          INSIST(startp < endp, REG_ECOLLATE);
          NOERR();
          endc = element(v, startp, endp);
          NOERR();
          break;
      default:
          ERR(REG_ERANGE);
          return;
          break;
      }
    } else {
      endc = startc;
    }

    /*
     * Ranges are unportable. Actually, standard C does guarantee that digits
     * are contiguous, but making that an exception is just too complicated.
     */

    if (startc != endc) {
      NOTE(REG_UUNPORT);
    }
    cv = range(v, startc, endc, (v->cflags&REG_ICASE));
    NOERR();
    dovec(v, cv, lp, rp);
}

/*
 - scanplain - scan PLAIN contents of [. etc.
 * Certain bits of trickery in lex.c know that this code does not try to look
 * past the final bracket of the [. etc.
 ^ static const chr *scanplain(struct vars *);
 */
static const chr *            /* just after end of sequence */
scanplain(
    struct vars *v)
{
    const chr *endp;

    assert(SEE(COLLEL) || SEE(ECLASS) || SEE(CCLASS));
    NEXT();

    endp = v->now;
    while (SEE(PLAIN)) {
      endp = v->now;
      NEXT();
    }

    assert(SEE(END) || ISERR());
    NEXT();

    return endp;
}

/*
 - onechr - fill in arcs for a plain character, and possible case complements
 * This is mostly a shortcut for efficient handling of the common case.
 ^ static void onechr(struct vars *, pchr, struct state *, struct state *);
 */
static void
onechr(
    struct vars *v,
    pchr c,
    struct state *lp,
    struct state *rp)
{
    if (!(v->cflags&REG_ICASE)) {
      newarc(v->nfa, PLAIN, subcolor(v->cm, c), lp, rp);
      return;
    }

    /*
     * Rats, need general case anyway...
     */

    dovec(v, allcases(v, c), lp, rp);
}

/*
 - dovec - fill in arcs for each element of a cvec
 ^ static void dovec(struct vars *, struct cvec *, struct state *,
 ^    struct state *);
 */
static void
dovec(
    struct vars *v,
    struct cvec *cv,
    struct state *lp,
    struct state *rp)
{
    chr ch, from, to;
    const chr *p;
    int i;

    for (p = cv->chrs, i = cv->nchrs; i > 0; p++, i--) {
      ch = *p;
      newarc(v->nfa, PLAIN, subcolor(v->cm, ch), lp, rp);
    }

    for (p = cv->ranges, i = cv->nranges; i > 0; p += 2, i--) {
      from = *p;
      to = *(p+1);
      if (from <= to) {
          subrange(v, from, to, lp, rp);
      }
    }

}

/*
 - wordchrs - set up word-chr list for word-boundary stuff, if needed
 * The list is kept as a bunch of arcs between two dummy states; it's disposed
 * of by the unreachable-states sweep in NFA optimization. Does NEXT(). Must
 * not be called from any unusual lexical context. This should be reconciled
 * with the \w etc. handling in lex.c, and should be cleaned up to reduce
 * dependencies on input scanning.
 ^ static void wordchrs(struct vars *);
 */
static void
wordchrs(
    struct vars *v)
{
    struct state *left, *right;

    if (v->wordchrs != NULL) {
      NEXT();           /* for consistency */
      return;
    }

    left = newstate(v->nfa);
    right = newstate(v->nfa);
    NOERR();

    /*
     * Fine point: implemented with [::], and lexer will set REG_ULOCALE.
     */

    lexword(v);
    NEXT();
    assert(v->savenow != NULL && SEE('['));
    bracket(v, left, right);
    assert((v->savenow != NULL && SEE(']')) || ISERR());
    NEXT();
    NOERR();
    v->wordchrs = left;
}

/*
 - subre - allocate a subre
 ^ static struct subre *subre(struct vars *, int, int, struct state *,
 ^    struct state *);
 */
static struct subre *
subre(
    struct vars *v,
    int op,
    int flags,
    struct state *begin,
    struct state *end)
{
    struct subre *ret = v->treefree;

    if (ret != NULL) {
      v->treefree = ret->left;
    } else {
      ret = (struct subre *) MALLOC(sizeof(struct subre));
      if (ret == NULL) {
          ERR(REG_ESPACE);
          return NULL;
      }
      ret->chain = v->treechain;
      v->treechain = ret;
    }

    assert(strchr("|.b(=", op) != NULL);

    ret->op = op;
    ret->flags = flags;
    ret->retry = 0;
    ret->subno = 0;
    ret->min = ret->max = 1;
    ret->left = NULL;
    ret->right = NULL;
    ret->begin = begin;
    ret->end = end;
    ZAPCNFA(ret->cnfa);

    return ret;
}

/*
 - freesubre - free a subRE subtree
 ^ static void freesubre(struct vars *, struct subre *);
 */
static void
freesubre(
    struct vars *v,           /* might be NULL */
    struct subre *sr)
{
    if (sr == NULL) {
      return;
    }

    if (sr->left != NULL) {
      freesubre(v, sr->left);
    }
    if (sr->right != NULL) {
      freesubre(v, sr->right);
    }

    freesrnode(v, sr);
}

/*
 - freesrnode - free one node in a subRE subtree
 ^ static void freesrnode(struct vars *, struct subre *);
 */
static void
freesrnode(
    struct vars *v,           /* might be NULL */
    struct subre *sr)
{
    if (sr == NULL) {
      return;
    }

    if (!NULLCNFA(sr->cnfa)) {
      freecnfa(&sr->cnfa);
    }
    sr->flags = 0;

    if (v != NULL) {
      sr->left = v->treefree;
      v->treefree = sr;
    } else {
      FREE(sr);
    }
}

/*
 - optst - optimize a subRE subtree
 ^ static void optst(struct vars *, struct subre *);
 */
static void
optst(
    struct vars *v,
    struct subre *t)
{
    /*
     * DGP (2007-11-13): I assume it was the programmer's intent to eventually
     * come back and add code to optimize subRE trees, but the routine coded
     * just spends effort traversing the tree and doing nothing. We can do
     * nothing with less effort.
     */

    return;
}

/*
 - numst - number tree nodes (assigning retry indexes)
 ^ static int numst(struct subre *, int);
 */
static int              /* next number */
numst(
    struct subre *t,
    int start)                /* starting point for subtree numbers */
{
    int i;

    assert(t != NULL);

    i = start;
    t->retry = (short) i++;
    if (t->left != NULL) {
      i = numst(t->left, i);
    }
    if (t->right != NULL) {
      i = numst(t->right, i);
    }
    return i;
}

/*
 - markst - mark tree nodes as INUSE
 ^ static void markst(struct subre *);
 */
static void
markst(
    struct subre *t)
{
    assert(t != NULL);

    t->flags |= INUSE;
    if (t->left != NULL) {
      markst(t->left);
    }
    if (t->right != NULL) {
      markst(t->right);
    }
}

/*
 - cleanst - free any tree nodes not marked INUSE
 ^ static void cleanst(struct vars *);
 */
static void
cleanst(
    struct vars *v)
{
    struct subre *t;
    struct subre *next;

    for (t = v->treechain; t != NULL; t = next) {
      next = t->chain;
      if (!(t->flags&INUSE)) {
          FREE(t);
      }
    }
    v->treechain = NULL;
    v->treefree = NULL;       /* just on general principles */
}

/*
 - nfatree - turn a subRE subtree into a tree of compacted NFAs
 ^ static long nfatree(struct vars *, struct subre *, FILE *);
 */
static long             /* optimize results from top node */
nfatree(
    struct vars *v,
    struct subre *t,
    FILE *f)                  /* for debug output */
{
    assert(t != NULL && t->begin != NULL);

    if (t->left != NULL) {
      (DISCARD) nfatree(v, t->left, f);
    }
    if (t->right != NULL) {
      (DISCARD) nfatree(v, t->right, f);
    }

    return nfanode(v, t, f);
}

/*
 - nfanode - do one NFA for nfatree
 ^ static long nfanode(struct vars *, struct subre *, FILE *);
 */
static long             /* optimize results */
nfanode(
    struct vars *v,
    struct subre *t,
    FILE *f)                  /* for debug output */
{
    struct nfa *nfa;
    long ret = 0;
    char idbuf[50];

    assert(t->begin != NULL);

    if (f != NULL) {
      fprintf(f, "\n\n\n========= TREE NODE %s ==========\n",
            stid(t, idbuf, sizeof(idbuf)));
    }
    nfa = newnfa(v, v->cm, v->nfa);
    NOERRZ();
    dupnfa(nfa, t->begin, t->end, nfa->init, nfa->final);
    if (!ISERR()) {
      specialcolors(nfa);
      ret = optimize(nfa, f);
    }
    if (!ISERR()) {
      compact(nfa, &t->cnfa);
    }

    freenfa(nfa);
    return ret;
}

/*
 - newlacon - allocate a lookahead-constraint subRE
 ^ static int newlacon(struct vars *, struct state *, struct state *, int);
 */
static int              /* lacon number */
newlacon(
    struct vars *v,
    struct state *begin,
    struct state *end,
    int pos)
{
    struct subre *sub;
    int n;

    if (v->nlacons == 0) {
      v->lacons = (struct subre *) MALLOC(2 * sizeof(struct subre));
      n = 1;            /* skip 0th */
      v->nlacons = 2;
    } else {
      v->lacons = (struct subre *) REALLOC(v->lacons,
            (v->nlacons+1)*sizeof(struct subre));
      n = v->nlacons++;
    }

    if (v->lacons == NULL) {
      ERR(REG_ESPACE);
      return 0;
    }

    sub = &v->lacons[n];
    sub->begin = begin;
    sub->end = end;
    sub->subno = pos;
    ZAPCNFA(sub->cnfa);
    return n;
}

/*
 - freelacons - free lookahead-constraint subRE vector
 ^ static void freelacons(struct subre *, int);
 */
static void
freelacons(
    struct subre *subs,
    int n)
{
    struct subre *sub;
    int i;

    assert(n > 0);
    for (sub=subs+1, i=n-1; i>0; sub++, i--) {  /* no 0th */
      if (!NULLCNFA(sub->cnfa)) {
          freecnfa(&sub->cnfa);
      }
    }
    FREE(subs);
}

/*
 - rfree - free a whole RE (insides of regfree)
 ^ static void rfree(regex_t *);
 */
static void
rfree(
    regex_t *re)
{
    struct guts *g;

    if (re == NULL || re->re_magic != REMAGIC) {
      return;
    }

    re->re_magic = 0;   /* invalidate RE */
    g = (struct guts *) re->re_guts;
    re->re_guts = NULL;
    re->re_fns = NULL;
    g->magic = 0;
    freecm(&g->cmap);
    if (g->tree != NULL) {
      freesubre(NULL, g->tree);
    }
    if (g->lacons != NULL) {
      freelacons(g->lacons, g->nlacons);
    }
    if (!NULLCNFA(g->search)) {
      freecnfa(&g->search);
    }
    FREE(g);
}

/*
 - dump - dump an RE in human-readable form
 ^ static void dump(regex_t *, FILE *);
 */
static void
dump(
    regex_t *re,
    FILE *f)
{
#ifdef REG_DEBUG
    struct guts *g;
    int i;

    if (re->re_magic != REMAGIC) {
      fprintf(f, "bad magic number (0x%x not 0x%x)\n",
            re->re_magic, REMAGIC);
    }
    if (re->re_guts == NULL) {
      fprintf(f, "NULL guts!!!\n");
      return;
    }
    g = (struct guts *) re->re_guts;
    if (g->magic != GUTSMAGIC) {
      fprintf(f, "bad guts magic number (0x%x not 0x%x)\n",
            g->magic, GUTSMAGIC);
    }

    fprintf(f, "\n\n\n========= DUMP ==========\n");
    fprintf(f, "nsub %d, info 0%lo, csize %d, ntree %d\n",
          re->re_nsub, re->re_info, re->re_csize, g->ntree);

    dumpcolors(&g->cmap, f);
    if (!NULLCNFA(g->search)) {
      printf("\nsearch:\n");
      dumpcnfa(&g->search, f);
    }
    for (i = 1; i < g->nlacons; i++) {
      fprintf(f, "\nla%d (%s):\n", i,
            (g->lacons[i].subno) ? "positive" : "negative");
      dumpcnfa(&g->lacons[i].cnfa, f);
    }
    fprintf(f, "\n");
    dumpst(g->tree, f, 0);
#endif
}

/*
 - dumpst - dump a subRE tree
 ^ static void dumpst(struct subre *, FILE *, int);
 */
static void
dumpst(
    struct subre *t,
    FILE *f,
    int nfapresent)           /* is the original NFA still around? */
{
    if (t == NULL) {
      fprintf(f, "null tree\n");
    } else {
      stdump(t, f, nfapresent);
    }
    fflush(f);
}

/*
 - stdump - recursive guts of dumpst
 ^ static void stdump(struct subre *, FILE *, int);
 */
static void
stdump(
    struct subre *t,
    FILE *f,
    int nfapresent)           /* is the original NFA still around? */
{
    char idbuf[50];

    fprintf(f, "%s. `%c'", stid(t, idbuf, sizeof(idbuf)), t->op);
    if (t->flags&LONGER) {
      fprintf(f, " longest");
    }
    if (t->flags&SHORTER) {
      fprintf(f, " shortest");
    }
    if (t->flags&MIXED) {
      fprintf(f, " hasmixed");
    }
    if (t->flags&CAP) {
      fprintf(f, " hascapture");
    }
    if (t->flags&BACKR) {
      fprintf(f, " hasbackref");
    }
    if (!(t->flags&INUSE)) {
      fprintf(f, " UNUSED");
    }
    if (t->subno != 0) {
      fprintf(f, " (#%d)", t->subno);
    }
    if (t->min != 1 || t->max != 1) {
      fprintf(f, " {%d,", t->min);
      if (t->max != INFINITY) {
          fprintf(f, "%d", t->max);
      }
      fprintf(f, "}");
    }
    if (nfapresent) {
      fprintf(f, " %ld-%ld", (long)t->begin->no, (long)t->end->no);
    }
    if (t->left != NULL) {
      fprintf(f, " L:%s", stid(t->left, idbuf, sizeof(idbuf)));
    }
    if (t->right != NULL) {
      fprintf(f, " R:%s", stid(t->right, idbuf, sizeof(idbuf)));
    }
    if (!NULLCNFA(t->cnfa)) {
      fprintf(f, "\n");
      dumpcnfa(&t->cnfa, f);
    }
    fprintf(f, "\n");
    if (t->left != NULL) {
      stdump(t->left, f, nfapresent);
    }
    if (t->right != NULL) {
      stdump(t->right, f, nfapresent);
    }
}

/*
 - stid - identify a subtree node for dumping
 ^ static char *stid(struct subre *, char *, size_t);
 */
static const char *                 /* points to buf or constant string */
stid(
    struct subre *t,
    char *buf,
    size_t bufsize)
{
    /*
     * Big enough for hex int or decimal t->retry?
     */

    if (bufsize < sizeof(void*)*2 + 3 || bufsize < sizeof(t->retry)*3 + 1) {
      return "unable";
    }
    if (t->retry != 0) {
      sprintf(buf, "%d", t->retry);
    } else {
      sprintf(buf, "%p", t);
    }
    return buf;
}

#include "regc_lex.c"
#include "regc_color.c"
#include "regc_nfa.c"
#include "regc_cvec.c"
#include "regc_locale.c"

/*
 * Local Variables:
 * mode: c
 * c-basic-offset: 4
 * fill-column: 78
 * End:
 */

Generated by  Doxygen 1.6.0   Back to index